diff --git a/source/common/avs2_defs.h b/source/common/avs2_defs.h
index 7a26160..1926068 100644
--- a/source/common/avs2_defs.h
+++ b/source/common/avs2_defs.h
@@ -256,7 +256,7 @@ enum sao_class_e {
 #define XAVS2_MIN3(a, b, c)   XAVS2_MIN((a), XAVS2_MIN((b),(c)))
 #define XAVS2_MAX3(a, b, c)   XAVS2_MAX((a), XAVS2_MAX((b),(c)))
 
-#define XAVS2_CLIP1(a)        ((a) > max_pel_value ? max_pel_value : ((a) < 0 ? 0 : (a)))
+//#define XAVS2_CLIP1(a)        ((a) > max_pel_value ? max_pel_value : ((a) < 0 ? 0 : (a)))
 #define XAVS2_CLIP3F(L, H, v) (((v) < (L)) ? (L) : (((v) > (H)) ? (H) : (v)))
 #define XAVS2_CLIP3(L, H, v)  xavs2_clip3(L, H, v)
 #define XAVS2_ABS(A)          ((A) < 0 ? (-(A)) : (A))    // abs macro, faster than procedure
@@ -281,9 +281,9 @@ static void XAVS2_SWAP_PTR(T *&x, T *&y)
  * global variables
  * ===========================================================================
  */
-static const int g_bit_depth   = BIT_DEPTH;
-static const int max_pel_value = (1 << BIT_DEPTH) - 1;
-static const int g_dc_value    = (1 << BIT_DEPTH) >> 1;
+//static const int g_bit_depth   = BIT_DEPTH;
+//static const int max_pel_value = (1 << BIT_DEPTH) - 1;
+//static const int g_dc_value    = (1 << BIT_DEPTH) >> 1;
 
 /**
  * ===========================================================================
@@ -291,10 +291,10 @@ static const int g_dc_value    = (1 << BIT_DEPTH) >> 1;
  * ===========================================================================
  */
 
-static ALWAYS_INLINE pel_t xavs2_clip_pixel(int x)
+/*static ALWAYS_INLINE pel_t xavs2_clip_pixel(int x)
 {
     return (pel_t)((x & ~max_pel_value) ? (-x) >> 31 & max_pel_value : x);
-}
+}*/
 
 static ALWAYS_INLINE int xavs2_clip3(int i_min, int i_max, int v)
 {
@@ -323,19 +323,19 @@ static ALWAYS_INLINE int xavs2_median(int a, int b, int c)
     return b;
 }
 
-// ·µ»ØÊýÖµµÄ·ûºÅÎ»£¬¸ºÊý·µ»Ø-1£¬·ñÔò·µ»Ø1
+// 返回数值的符号位，负数返回-1，否则返回1
 static ALWAYS_INLINE int xavs2_sign2(int val)
 {
     return ((val >> 31) << 1) + 1;
 }
 
-// ·µ»ØÊýÖµµÄ·ûºÅÎ»£¬¸ºÊý·µ»Ø-1£¬0Öµ·µ»Ø0£¬ÕýÊý·µ»Ø1
+// 返回数值的符号位，负数返回-1，0值返回0，正数返回1
 static ALWAYS_INLINE int xavs2_sign3(int val)
 {
     return (val >> 31) | (int)(((uint32_t)-val) >> 31u);
 }
 
-// ¼ÆËãÕýÕûÊýµÄlog2Öµ£¬0ºÍ1Ê±·µ»Ø0£¬ÆäËû·µ»Ølog2(val)
+// è®¡ç®—æ­£æ•´æ•°çš„log2å€¼ï¼Œ0å’Œ1æ—¶è¿”å›ž0ï¼Œå…¶ä»–è¿”å›žlog2(val)
 #define xavs2_log2u(val)  xavs2_ctz(val)
 
 
diff --git a/source/common/basic_types.h b/source/common/basic_types.h
index 59d9127..a738055 100644
--- a/source/common/basic_types.h
+++ b/source/common/basic_types.h
@@ -47,11 +47,16 @@
  * basic types
  * ===========================================================================
  */
-typedef uint8_t     pel_t;      /* type for pixel */
-typedef int16_t     itr_t;      /* intra prediction temp */
+
+typedef uint16_t    pel10_t;      /* type for pixel value */
+typedef uint64_t    pixel10_4;     /* type for 4-pixels value */
+typedef int32_t     itr10_t;      /* intra prediction temp */
+typedef uint8_t     pel8_t;      /* type for pixel value */
+typedef uint32_t    pixel8_4;     /* type for 4-pixels value */
+typedef int16_t     itr8_t;      /* intra prediction temp */
+
 typedef uint16_t    sum_t;
 typedef uint32_t    sum2_t;
-typedef uint32_t    pixel4;
 typedef int32_t     ssum2_t;    /* Signed sum */
 typedef int32_t     dist_t;
 
diff --git a/source/common/common.h b/source/common/common.h
index ab91348..14462d1 100644
--- a/source/common/common.h
+++ b/source/common/common.h
@@ -106,18 +106,26 @@
 /* ---------------------------------------------------------------------------
  * memory malloc
  */
-#define CHECKED_MALLOC(var, type, size) \
+#define CHECKED_MALLOC8(var, type, size) \
     MULTI_LINE_MACRO_BEGIN\
     (var) = (type)xavs2_malloc(size);\
     if ((var) == NULL) {\
-        goto fail;\
+        goto fail8;\
+    }\
+    MULTI_LINE_MACRO_END
+
+#define CHECKED_MALLOC10(var, type, size) \
+    MULTI_LINE_MACRO_BEGIN\
+    (var) = (type)xavs2_malloc(size);\
+    if ((var) == NULL) {\
+        goto fail10;\
     }\
     MULTI_LINE_MACRO_END
 
 #define CHECKED_MALLOCZERO(var, type, size) \
     MULTI_LINE_MACRO_BEGIN\
     size_t new_size = ((size + 31) >> 5) << 5; /* align the size to 32 bytes */ \
-    CHECKED_MALLOC(var, type, new_size);\
+    CHECKED_MALLOC8(var, type, new_size);\
     g_funcs.memzero_aligned(var, new_size); \
     MULTI_LINE_MACRO_END
 
@@ -392,7 +400,7 @@ typedef union runlevel_pair_t {
 
 /* ---------------------------------------------------------------------------
  * run-level infos (CG: Coefficient Group)
- * ìØ±àÂë¹ý³ÌÖÐ×î´óµÄ±ä»»¿éÎª 32x32£¬×î¶à 8*8 ¸öCG
+ * ç†µç¼–ç è¿‡ç¨‹ä¸­æœ€å¤§çš„å˜æ¢å—ä¸º 32x32ï¼Œæœ€å¤š 8*8 ä¸ªCG
  */
 typedef struct runlevel_t {
     ALIGN16(runlevel_pair_t runlevels_cg[16]);
@@ -411,7 +419,7 @@ typedef struct runlevel_t {
  * binary_t
  */
 typedef struct binary_t {
-    /* Óï·¨ÔªËØ±àÂëÓÃº¯ÊýÖ¸Õë */
+    /* 语法元素编码用函数指针 */
     int (*write_intra_pred_mode)(aec_t *p_aec, int ipmode);
     int (*write_ctu_split_flag)(aec_t *p_aec, int i_cu_split, int i_cu_level);
     int (*est_cu_header)(xavs2_t *h, aec_t *p_aec, cu_t *p_cu);
@@ -464,14 +472,14 @@ typedef struct binary_t {
 #define NUM_LAST_CG_CTX_CHROMA  6
 #define NUM_SIGN_CG_CTX_LUMA    2
 #define NUM_SIGN_CG_CTX_CHROMA  1
-#define NUM_LAST_POS_CTX_LUMA   48    /* last_coeff_pos_x ºÍ last_coeff_pos_y ¹²¼ÆÓÐ48¸öÉ«¶È·ÖÁ¿ÉÏÏÂÎÄ */
-#define NUM_LAST_POS_CTX_CHROMA 12    /* last_coeff_pos_x ºÍ last_coeff_pos_y ¹²¼ÆÓÐ12¸öÉ«¶È·ÖÁ¿ÉÏÏÂÎÄ */
+#define NUM_LAST_POS_CTX_LUMA   48    /* last_coeff_pos_x 和 last_coeff_pos_y 共计有48个色度分量上下文 */
+#define NUM_LAST_POS_CTX_CHROMA 12    /* last_coeff_pos_x 和 last_coeff_pos_y 共计有12个色度分量上下文 */
 
 #define NUM_MAP_CTX             12
 #define NUM_LAST_CG_CTX         (NUM_LAST_CG_CTX_LUMA  + NUM_LAST_CG_CTX_CHROMA)   /* last_cg_pos:6; + last_cg0_flag:2(IsChroma); last_cg_x:2; last_cg_y:2 */
 #define NUM_SIGN_CG_CTX         (NUM_SIGN_CG_CTX_LUMA  + NUM_SIGN_CG_CTX_CHROMA)
 #define NUM_LAST_POS_CTX        (NUM_LAST_POS_CTX_LUMA + NUM_LAST_POS_CTX_CHROMA)  /* last_coeff_pos_x: (30) + last_coeff_pos_y: (30) */
-#define NUM_COEFF_LEVEL_CTX     40    /* CoeffLevelMinus1Band Îª 0 Ê± coeff_level_minus1_pos_in_band */
+#define NUM_COEFF_LEVEL_CTX     40    /* CoeffLevelMinus1Band 为 0 时 coeff_level_minus1_pos_in_band */
 
 #define NUM_SAO_MERGE_FLAG_CTX  3
 #define NUM_SAO_MODE_CTX        1
@@ -682,27 +690,27 @@ typedef struct ctx_set_t {
     context_t pu_reference_index            [NUM_REF_NO_CTX        ];
     context_t cbp_contexts                  [NUM_CBP_CTX           ];
     context_t mvd_contexts               [2][NUM_MVD_CTX           ];
-    /* Ö¡¼äÔ¤²â */
+    /* 帧间预测 */
     context_t pu_type_index                 [NUM_INTER_DIR_CTX     ];    // b_pu_type_index[15] = f_pu_type_index[3] + dir_multi_hypothesis_mode[12]
     context_t b_pu_type_min_index           [NUM_INTER_DIR_MIN_CTX ];
     // b_pu_type_index2 // for B_NxN
     // f_pu_type_index2 // for F_NxN
-    context_t cu_subtype_index              [DS_MAX_NUM            ];  // B_Skip/B_Direct, F_Skip/F_Direct ¹«ÓÃ
+    context_t cu_subtype_index              [DS_MAX_NUM            ];  // B_Skip/B_Direct, F_Skip/F_Direct 公用
     context_t weighted_skip_mode            [WPM_NUM               ];
-    /* Ö¡ÄÚÔ¤²â */
+    /* 帧内预测 */
     context_t intra_luma_pred_mode          [NUM_INTRA_MODE_CTX    ];
     context_t intra_chroma_pred_mode        [NUM_INTRA_MODE_C_CTX  ];
-    /* CU ¼¶±ðQPµ÷Õû */
+    /* CU çº§åˆ«QPè°ƒæ•´ */
 #if ENABLE_RATE_CONTROL_CU
     context_t delta_qp_contexts             [NUM_DELTA_QP_CTX      ];
 #endif
-    /* ±ä»»ÏµÊý±àÂë */
+    /* 变换系数编码 */
     context_t coeff_run [2][NUM_BLOCK_TYPES][NUM_MAP_CTX           ];  // [0:Luma, 1:Chroma][rank][ctx_idx]
     context_t nonzero_cg_flag               [NUM_SIGN_CG_CTX       ];
     context_t last_cg_contexts              [NUM_LAST_CG_CTX       ];
     context_t last_pos_contexts             [NUM_LAST_POS_CTX      ];
     context_t coeff_level                   [NUM_COEFF_LEVEL_CTX   ];
-    /* ºó´¦ÀíÄ£¿é */
+    /* 后处理模块 */
     context_t sao_merge_type_index          [NUM_SAO_MERGE_FLAG_CTX];
     context_t sao_mode                      [NUM_SAO_MODE_CTX      ];
     context_t sao_interval_offset_abs       [NUM_SAO_OFFSET_CTX    ];
@@ -749,7 +757,8 @@ typedef struct slice_t {
     uint8_t    *p_slice_bs_buf;       /* pointer of bitstream buffer (start address) */
 
     /* slice buffers */
-    pel_t      *slice_intra_border[3];    /* buffer for store decoded bottom pixels of the top lcu row (before filter) */
+    pel8_t      *slice_intra_border8[3];    /* buffer for store decoded bottom pixels of the top lcu row (before filter) */
+    pel10_t      *slice_intra_border10[3];    /* buffer for store decoded bottom pixels of the top lcu row (before filter) */
     uint8_t    *slice_deblock_flag[2];    /* buffer for edge filter flag (of one LCU row), [dir][(scu_y, scu_x)] */
     int8_t     *slice_ipredmode;          /* [(i_height_in_minpu + 1) * (i_width_in_minpu + 16)], prediction intra mode */
 
@@ -808,7 +817,8 @@ struct cu_info_t {
     int         i_scu_x;              /* horizontal position for the first SCU in CU */
     int         i_scu_y;              /* vertical   position for the first SCU in CU */
 
-    pel_t      *p_rec[3];             /* reconstruction pixels for current cu [y/u/v] */
+    pel8_t      *p_rec8[3];             /* reconstruction pixels for current cu [y/u/v] */
+    pel10_t      *p_rec10[3];             /* reconstruction pixels for current cu [y/u/v] */
     coeff_t    *p_coeff[3];           /* residual coefficient  for current cu [y/u/v] */
 
     int8_t      i_level;              /* cu level, 3: 8x8, 4: 16x16, 5: 32x32, 6: 64x64 */
@@ -861,13 +871,13 @@ struct cu_info_t {
  * cu_mv_mode_t
  */
 typedef struct cu_mv_mode_t {
-    mv_t        all_sym_mv[1];              /* ¶Ô³ÆÄ£Ê½µÄMV */
+    mv_t        all_sym_mv[1];              /* å¯¹ç§°æ¨¡å¼çš„MV */
     mv_t        all_single_mv[MAX_REFS];
 
-    /* mvp¿ÉÒÔÖ»¶ÔÕû¸öLCUÖ»±£ÁôÒ»·Ý£¬ÎÞÐë°´ÕÕÉî¶È·Ö²ã */
+    /* mvpå¯ä»¥åªå¯¹æ•´ä¸ªLCUåªä¿ç•™ä¸€ä»½ï¼Œæ— é¡»æŒ‰ç…§æ·±åº¦åˆ†å±‚ */
     mv_t        all_mvp[MAX_REFS];          /* 1st MVP of dual hypothesis prediction mode, or Foreword of BiPrediction */
 
-    /* Ë«ÏòMVÒ²Ö»ÐèÒª±£ÁôÒ»·Ý */
+    /* åŒå‘MVä¹Ÿåªéœ€è¦ä¿ç•™ä¸€ä»½ */
     mv_t        all_dual_mv_1st[MAX_REFS];
     mv_t        all_dual_mv_2nd[MAX_REFS];
 } cu_mv_mode_t;
@@ -885,10 +895,10 @@ typedef struct cu_mc_param_t {
  * cu_mode_t
  */
 typedef struct cu_mode_t {
-    uint8_t       mv_padding1[16];          /* ±ÜÃâÔ½½ç£¬ÖÁÉÙÐè2×Ö½Ú£¬´Ë´¦Îª¶ÔÆë²¹µ½16×Ö½Ú */
+    uint8_t       mv_padding1[16];          /* 避免越界，至少需2字节，此处为对齐补到16字节 */
     cu_mv_mode_t  mvs[MAX_INTER_MODES][4];  /* MVs for normal inter prediction */
     cu_mc_param_t best_mc;                  /* MVs to store */
-    cu_mc_param_t best_mc_tmp;              /* ÓÃÓÚËã·¨ OPT_ROUGH_PU_SEL ±£´æ¶à¸öÖ¡¼ä»®·ÖÄ£Ê½µÄ×î¼Ñ²ÎÊý£¨²»Ò»¶¨ÊÇÈ«¾Ö×îÓÅ£© */
+    cu_mc_param_t best_mc_tmp;              /* 用于算法 OPT_ROUGH_PU_SEL 保存多个帧间划分模式的最佳参数（不一定是全局最优） */
 
     int8_t      ref_idx_single[4];          /* [block], preserved for DMH */
 
@@ -923,7 +933,7 @@ typedef struct cu_feature_t {
      * 2: only try current depth
      * --------------------------- */
     int        pred_split_type;         /* prediction of cu split type: 0: un-determined; 1: split; 2: not-split */
-    rdcost_t   pred_costs[MAX_PRED_MODES];  /* Ã¿ÖÖPU»®·ÖÄ£Ê½µÄ cost £¨»ùÓÚÔ¤·ÖÎöµÈ»ñÈ¡£© */
+    rdcost_t   pred_costs[MAX_PRED_MODES];  /* æ¯ç§PUåˆ’åˆ†æ¨¡å¼çš„ cost ï¼ˆåŸºäºŽé¢„åˆ†æžç­‰èŽ·å–ï¼‰ */
 } cu_feature_t;
 
 
@@ -1046,10 +1056,12 @@ struct xavs2_frame_t {
     int         i_stride[3];          /* stride for Y/U/V */
     int         i_width[3];           /* width  for Y/U/V */
     int         i_lines[3];           /* height for Y/U/V */
-    pel_t      *planes[3];            /* pointers to Y/U/V data buffer */
-    pel_t      *filtered[16];         /* pointers to interpolated luma data buffers */
-
-    pel_t      *plane_buf;
+    pel10_t      *planes10[3];            /* pointers to Y/U/V data buffer */
+    pel10_t      *filtered10[16];         /* pointers to interpolated luma data buffers */
+    pel10_t      *plane_buf10;
+    pel8_t      *planes8[3];            /* pointers to Y/U/V data buffer */
+    pel8_t      *filtered8[16];         /* pointers to interpolated luma data buffers */
+    pel8_t      *plane_buf8;
     int         size_plane_buf;
 
     /* bit stream buffer */
@@ -1106,7 +1118,8 @@ typedef struct xavs2_me_t {
     bool_t      b_search_dmh;         /* is searching for DMH mode */
 
     /* pointers */
-    pel_t         *p_fenc;            /* pointer to the current PU block in source CTU */
+    pel8_t         *p_fenc8;            /* pointer to the current PU block in source CTU */
+    pel10_t         *p_fenc10;            /* pointer to the current PU block in source CTU */
     xavs2_frame_t *p_fref_1st;        /* pointer to the current (1st) reference frame */
     xavs2_frame_t *p_fref_2nd;        /* pointer to the current  2nd  reference frame */
 
@@ -1334,7 +1347,8 @@ typedef struct cu_layer_t {
     rdcost_t         mode_rdcost[MAX_PRED_MODES];   /* min rd-cost for each mode */
     int              mask_md_res_pred;              /* available mode mask */
 
-    pel_t           *p_rec_tmp[3];    /* tmp pointers to ping-pong buffer for swapping */
+    pel8_t           *p_rec8_tmp[3];    /* tmp pointers to ping-pong buffer for swapping */
+    pel10_t           *p_rec10_tmp[3];    /* tmp pointers to ping-pong buffer for swapping */
     coeff_t         *p_coeff_tmp[3];  /* tmp pointers to ping-pong buffer for swapping */
 
     cu_info_t        cu_best;         /* best info for each cu depth */
@@ -1355,16 +1369,21 @@ typedef struct cu_layer_t {
 #define FDEC_BUF_SIZE  (FDEC_STRIDE * (MAX_CU_SIZE + MAX_CU_SIZE / 2))
 #define LCU_BUF_SIZE   (MAX_CU_SIZE * MAX_CU_SIZE)
 
-    ALIGN32(pel_t   rec_buf_y     [3][LCU_BUF_SIZE]);       /* luma   reconstruction buffer     [cur/tmp/best][] */
+    ALIGN32(pel8_t   rec8_buf_y     [3][LCU_BUF_SIZE]);       /* luma   reconstruction buffer     [cur/tmp/best][] */
+    ALIGN32(pel10_t   rec10_buf_y     [3][LCU_BUF_SIZE]);       /* luma   reconstruction buffer     [cur/tmp/best][] */
     ALIGN32(coeff_t coef_buf_y    [3][LCU_BUF_SIZE]);       /* luma   coefficient    buffer     [cur/tmp/best][] */
-    ALIGN32(pel_t   rec_buf_uv [2][3][LCU_BUF_SIZE >> 2]);  /* chroma reconstruction buffer [uv][cur/tmp/best][] */
+    ALIGN32(pel8_t   rec8_buf_uv [2][3][LCU_BUF_SIZE >> 2]);  /* chroma reconstruction buffer [uv][cur/tmp/best][] */
+    ALIGN32(pel10_t   rec10_buf_uv [2][3][LCU_BUF_SIZE >> 2]);  /* chroma reconstruction buffer [uv][cur/tmp/best][] */
     ALIGN32(coeff_t coef_buf_uv[2][3][LCU_BUF_SIZE >> 2]);  /* chroma coefficient    buffer [uv][cur/tmp/best][] */
 
     /* inter prediction buffer */
-    ALIGN32(pel_t   buf_pred_inter_luma[2][LCU_BUF_SIZE]);  /* temporary decoding buffer for inter prediction (luma) */
+    ALIGN32(pel8_t   buf_pred_inter_luma8[2][LCU_BUF_SIZE]);  /* temporary decoding buffer for inter prediction (luma) */
+    ALIGN32(pel10_t   buf_pred_inter_luma10[2][LCU_BUF_SIZE]);  /* temporary decoding buffer for inter prediction (luma) */
     /* Ping-pong buffer for inter prediction */
-    pel_t   *buf_pred_inter;        /* current inter prediction buffer */
-    pel_t   *buf_pred_inter_best;   /* backup of best inter prediction */
+    pel8_t   *buf_pred_inter8;        /* current inter prediction buffer */
+    pel10_t   *buf_pred_inter10;        /* current inter prediction buffer */
+    pel8_t   *buf_pred_inter8_best;   /* backup of best inter prediction */
+    pel10_t   *buf_pred_inter10_best;   /* backup of best inter prediction */
 } cu_layer_t;
 
 /* ---------------------------------------------------------------------------
@@ -1376,13 +1395,18 @@ typedef struct cu_parallel_t {
     ALIGN32(coeff_t coeff_bak[LCU_BUF_SIZE]);
 
     /* buffers used for inter prediction */
-    ALIGN32(pel_t   buf_pred_inter_c[LCU_BUF_SIZE >> 1]);   /* temporary decoding buffer for inter prediction (chroma) */
-    ALIGN32(pel_t   buf_pixel_temp  [LCU_BUF_SIZE]);        /* temporary pixel buffer, used for bi/dual-prediction */
+    ALIGN32(pel8_t   buf_pred_inter8_c[LCU_BUF_SIZE >> 1]);   /* temporary decoding buffer for inter prediction (chroma) */
+    ALIGN32(pel10_t   buf_pred_inter10_c[LCU_BUF_SIZE >> 1]);   /* temporary decoding buffer for inter prediction (chroma) */
+    ALIGN32(pel8_t   buf_pixel_temp8  [LCU_BUF_SIZE]);        /* temporary pixel buffer, used for bi/dual-prediction */
+    ALIGN32(pel10_t   buf_pixel_temp10  [LCU_BUF_SIZE]);        /* temporary pixel buffer, used for bi/dual-prediction */
 
     /* predication buffers for all intra modes */
-    ALIGN32(pel_t   intra_pred  [NUM_INTRA_MODE       ][LCU_BUF_SIZE]);         /* for all 33 luma prediction modes */
-    ALIGN32(pel_t   intra_pred_c[NUM_INTRA_MODE_CHROMA][LCU_BUF_SIZE >> 1]);    /* for all chroma intra prediction modes */
-    ALIGN32(pel_t   buf_edge_pixels[MAX_CU_SIZE << 3]);     /* reference pixels for intra luma/chroma prediction */
+    ALIGN32(pel8_t   intra8_pred  [NUM_INTRA_MODE       ][LCU_BUF_SIZE]);         /* for all 33 luma prediction modes */
+    ALIGN32(pel10_t   intra10_pred  [NUM_INTRA_MODE       ][LCU_BUF_SIZE]);         /* for all 33 luma prediction modes */
+    ALIGN32(pel8_t   intra8_pred_c[NUM_INTRA_MODE_CHROMA][LCU_BUF_SIZE >> 1]);    /* for all chroma intra prediction modes */
+    ALIGN32(pel10_t   intra10_pred_c[NUM_INTRA_MODE_CHROMA][LCU_BUF_SIZE >> 1]);    /* for all chroma intra prediction modes */
+    ALIGN32(pel8_t   buf_edge_pixels8[MAX_CU_SIZE << 3]);     /* reference pixels for intra luma/chroma prediction */
+    ALIGN32(pel10_t   buf_edge_pixels10[MAX_CU_SIZE << 3]);     /* reference pixels for intra luma/chroma prediction */
 
     runlevel_t       runlevel;         /* run level buffer for RDO */
 
@@ -1408,7 +1432,7 @@ struct xavs2_t {
     ALIGN32(xavs2_log_t    module_log);      /* log module */
     /* === BEGIN ===================================================
      * communal variables
-     * ÐòÁÐ¼¶£¨±àÂëµÄËùÓÐÖ¡£©¹²Ïí±äÁ¿ÇøÓò¿ªÊ¼
+     * 序列级（编码的所有帧）共享变量区域开始
      */
 
     ALIGN32(SYNC_VARS_1(communal_vars_1));
@@ -1436,7 +1460,7 @@ struct xavs2_t {
     bool_t      b_progressive;
     bool_t      b_field_sequence;
     bool_t      use_fractional_me;    /* whether use fractional Motion Estimation
-                                       * 0: ¹Ø±Õ·ÖÏñËØËÑË÷£»1: ¿ªÆô1/2·ÖÏñËØËÑË÷£»2:¿ªÆô1/4·ÖÏñËØËÑË÷
+                                       * 0: å…³é—­åˆ†åƒç´ æœç´¢ï¼›1: å¼€å¯1/2åˆ†åƒç´ æœç´¢ï¼›2:å¼€å¯1/4åˆ†åƒç´ æœç´¢
                                        */
     bool_t      use_fast_sub_me;      /* whether use fast quarter Motion Estimation: skip half fractional search point (from futl) */
     bool_t      UMH_big_hex_level;     /* whether skip big hex pattern when using UMH (from futl)
@@ -1467,8 +1491,11 @@ struct xavs2_t {
     int         min_mv_range[2];      /* mv range (min) decided by the level id */
     int         max_mv_range[2];      /* mv range (max) decided by the level id */
     /* function pointers */
-    int       (*get_intra_candidates_luma)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
-                                           pel_t *p_fenc, int mpm[], int blockidx,
+    int       (*get_intra_candidates_luma8)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                           pel8_t *p_fenc, int mpm[], int blockidx,
+                                           int block_x, int block_y, int block_w, int block_h);
+    int       (*get_intra_candidates_luma10)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                           pel10_t *p_fenc, int mpm[], int blockidx,
                                            int block_x, int block_y, int block_w, int block_h);
     int       (*get_intra_candidates_chroma)(xavs2_t *h, cu_t *p_cu, int i_level, int pix_y_c, int pix_x_c,
                                              intra_candidate_t *p_candidate_list);
@@ -1477,8 +1504,8 @@ struct xavs2_t {
     uint8_t    *tab_avail_TR;         /* pointers to array of available table, Top Right */
     uint8_t    *tab_avail_DL;         /* pointers to array of available table, Down Left */
     uint8_t     tab_num_intra_rdo[MAX_CU_SIZE_IN_BIT + 1];    /* pointers to array of table, indicate numbers of intra prediction modes for RDO */
-    int8_t      num_intra_rmd_dist2;  /* ¾àÀë2µÄ½Ç¶ÈµÄËÑË÷ÊýÁ¿ */
-    int8_t      num_intra_rmd_dist1;  /* ¾àÀë1µÄ½Ç¶ÈµÄËÑË÷ÊýÁ¿ */
+    int8_t      num_intra_rmd_dist2;  /* 距离2的角度的搜索数量 */
+    int8_t      num_intra_rmd_dist1;  /* 距离1的角度的搜索数量 */
     int8_t      num_rdo_intra_chroma; /* number of RDO modes for intra chroma prediction */
 
     SYNC_VARS_2(communal_vars_2);
@@ -1486,7 +1513,7 @@ struct xavs2_t {
 
     /* === BEGIN ===================================================
      * row-dependent variables : values below need to be synchronized between rows
-     * Ö¡¼¶¹²Ïí±äÁ¿ÇøÓò¿ªÊ¼£¬Ã¿Ö¡µÄ¶à¸öÐÐ¼¶Ïß³ÌÖ®¼ä·ÃÎÊÏàÍ¬ÄÚÈÝ
+     * 帧级共享变量区域开始，每帧的多个行级线程之间访问相同内容
      */
     SYNC_VARS_1(row_vars_1);
 
@@ -1517,12 +1544,13 @@ struct xavs2_t {
     slice_t    *slices[MAX_SLICES];   /* all slices */
     int         i_slice_index;        /* slice index for the current thread */
 
-    /* ²»Í¬Slice²»Í¬µÄbuffer */
-    pel_t      *intra_border[3];      /* buffer for store decoded bottom pixels of the top lcu row (before filter) */
+    /* ä¸åŒSliceä¸åŒçš„buffer */
+    pel8_t      *intra_border8[3];      /* buffer for store decoded bottom pixels of the top lcu row (before filter) */
+    pel10_t      *intra_border10[3];      /* buffer for store decoded bottom pixels of the top lcu row (before filter) */
     uint8_t    *p_deblock_flag[2];    /* buffer for edge filter flag (of one LCU row), [dir][(scu_y, scu_x)] */
     int8_t     *ipredmode;            /* [(i_height_in_minpu + 1) * (i_width_in_minpu + 16)], prediction intra mode */
 
-    /* Ö¡¼¶Î¨Ò»µÄbuffer */
+    /* å¸§çº§å”¯ä¸€çš„buffer */
     int8_t     *lcu_slice_idx;        /* [i_height_in_lcu][i_width_in_lcu] */
     int8_t     *dir_pred;             /* [i_height_in_minpu][i_width_in_minpu], inter prediction direction */
     int8_t     *fwd_1st_ref;          /* [i_height_in_minpu][i_width_in_minpu] */
@@ -1537,7 +1565,7 @@ struct xavs2_t {
     double      thres_qsfd_cu[2][CTU_DEPTH];  /* QSFD threshold for inter frame, [0:inter, 1:intra][log2_cu_size - 3] */
 
     xavs2_frame_t *img_sao;          /* reconstruction image for SAO */
-    SAOStatData(*sao_stat_datas)[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES]; /* [lcu][comp][types], ¿É²»ÓÃÈ«¾Ö */
+    SAOStatData(*sao_stat_datas)[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES]; /* [lcu][comp][types], 可不用全局 */
     SAOBlkParam(*sao_blk_params)[NUM_SAO_COMPONENTS];   /* [lcu][comp] */
     int        (*num_sao_lcu_off)[NUM_SAO_COMPONENTS];  /* [lcu_row][comp] */
     bool_t       slice_sao_on   [NUM_SAO_COMPONENTS];
@@ -1594,8 +1622,11 @@ struct xavs2_t {
         bool_t  b_2nd_rdcost_pass;    /* 2nd pass for RDCost update */
 
         /* function pointers for RDO */
-        int   (*get_intra_dir_for_rdo_luma)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
-                                            pel_t *p_fenc, int mpm[], int blockidx,
+        int   (*get_intra_dir_for_rdo_luma8)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                            pel8_t *p_fenc, int mpm[], int blockidx,
+                                            int block_x, int block_y, int block_w, int block_h);
+        int   (*get_intra_dir_for_rdo_luma10)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                            pel10_t *p_fenc, int mpm[], int blockidx,
                                             int block_x, int block_y, int block_w, int block_h);
         int   (*get_skip_mvs)(xavs2_t *h, cu_t *p_cu);  /* get MVs for skip/direct mode */
 
@@ -1606,8 +1637,10 @@ struct xavs2_t {
         cu_t   *p_ctu;                /* pointer to the top of current CTU */
 
         /* 2, enc/dec/pred Y/U/V pointers */
-        pel_t      *p_fdec[3];        /* [Y/U/V] pointer over lcu of the frame to be reconstructed */
-        pel_t      *p_fenc[3];        /* [Y/U/V] pointer over lcu of the frame to be compressed */
+        pel8_t      *p_fdec8[3];        /* [Y/U/V] pointer over lcu of the frame to be reconstructed */
+        pel10_t      *p_fdec10[3];        /* [Y/U/V] pointer over lcu of the frame to be reconstructed */
+        pel8_t      *p_fenc8[3];        /* [Y/U/V] pointer over lcu of the frame to be compressed */
+        pel10_t      *p_fenc10[3];        /* [Y/U/V] pointer over lcu of the frame to be compressed */
 
         coeff_t    *lcu_coeff[3];     /* [Y/U/V] coefficients of LCU */
 
@@ -1619,15 +1652,22 @@ struct xavs2_t {
 #if PARALLEL_INSIDE_CTU
         cu_parallel_t   cu_enc  [CTU_DEPTH];
 #else
-        cu_parallel_t   cu_enc  [1];                /* ÎÞCTUÄÚµÄ¶àÏß³ÌÊ±£¬Ö»ÐèÒªÒ»¸ö */
+        cu_parallel_t   cu_enc  [1];                /* æ— CTUå†…çš„å¤šçº¿ç¨‹æ—¶ï¼Œåªéœ€è¦ä¸€ä¸ª */
 #endif
 
-        ALIGN32(pel_t   fenc_buf[FENC_BUF_SIZE]);   /* encoding buffer (source Y/U/V buffer) */
-        ALIGN32(pel_t   fdec_buf[FDEC_BUF_SIZE]);   /* decoding buffer (Reconstruction Y/U/V buffer) */
-        struct lcu_intra_border_t {
-            ALIGN32(pel_t rec_left[MAX_CU_SIZE]);          /* Left border of current LCU */
-            ALIGN32(pel_t rec_top[MAX_CU_SIZE * 2 + 32]);  /* top-left, top and top-right samples (Reconstruction) of current LCU */
-        } ctu_border[IMG_CMPNTS];                   /* Y, U, V components */
+        ALIGN32(pel8_t   fenc_buf8[FENC_BUF_SIZE]);   /* encoding buffer (source Y/U/V buffer) */
+        ALIGN32(pel8_t   fdec_buf8[FDEC_BUF_SIZE]);   /* decoding buffer (Reconstruction Y/U/V buffer) */
+        ALIGN32(pel10_t   fenc_buf10[FENC_BUF_SIZE]);   /* encoding buffer (source Y/U/V buffer) */
+        ALIGN32(pel10_t   fdec_buf10[FDEC_BUF_SIZE]);   /* decoding buffer (Reconstruction Y/U/V buffer) */
+        struct lcu_intra_border8_t {
+            ALIGN32(pel8_t rec_left[MAX_CU_SIZE]);          /* Left border of current LCU */
+            ALIGN32(pel8_t rec_top[MAX_CU_SIZE * 2 + 32]);  /* top-left, top and top-right samples (Reconstruction) of current LCU */
+        } ctu_border8[IMG_CMPNTS];                   /* Y, U, V components */
+
+        struct lcu_intra_border10_t {
+            ALIGN32(pel10_t rec_left[MAX_CU_SIZE]);          /* Left border of current LCU */
+            ALIGN32(pel10_t rec_top[MAX_CU_SIZE * 2 + 32]);  /* top-left, top and top-right samples (Reconstruction) of current LCU */
+        } ctu_border10[IMG_CMPNTS];                   /* Y, U, V components */
 
         /* buffer for the coding tree units */
         ALIGN16(cu_t    all_cu[85]);                /* all cu: 1(64x64) + 4(32x32) + 16(16x16) + 64(8x8) = 85 */
@@ -1640,7 +1680,7 @@ struct xavs2_t {
     /* coding states in RDO, independent for each thread */
     struct coding_states {
 
-        /* Ö»ÓÃÓÚ±¸·ÝÉÏÏÂÎÄ×´Ì¬£¬ÎÞÐè³õÊ¼»¯ */
+        /* 只用于备份上下文状态，无需初始化 */
         aec_t  cs_sao_start;
         aec_t  cs_sao_best;
         aec_t  cs_sao_temp;
diff --git a/source/common/cudata.c b/source/common/cudata.c
index 04e20f0..d84d534 100644
--- a/source/common/cudata.c
+++ b/source/common/cudata.c
@@ -192,13 +192,31 @@ void cu_get_mvds(xavs2_t *h, cu_t *p_cu)
 /* ---------------------------------------------------------------------------
  * copy one block (multi-planes)
  */
-static void block_copy_x3(pel_t *p_dst[], int i_dst[], pel_t *p_src[], int i_src[], int i_width[], int i_height[], int i_planes)
+static void block_copy8_x3(pel8_t *p_dst[], int i_dst[], pel8_t *p_src[], int i_src[], int i_width[], int i_height[], int i_planes)
 {
-    pel_t *dst, *src;
+    pel8_t *dst, *src;
     int y, k;
 
     for (k = 0; k < i_planes; k++) {
-        int i_size = i_width[k] * sizeof(pel_t);
+        int i_size = i_width[k] * sizeof(pel8_t);
+        memcpy_t f_memcpy = i_size & 15 ? memcpy : g_funcs.memcpy_aligned;
+        dst = p_dst[k];
+        src = p_src[k];
+        for (y = i_height[k]; y != 0; y--) {
+            f_memcpy(dst, src, i_size);
+            dst += i_dst[k];
+            src += i_src[k];
+        }
+    }
+}
+
+static void block_copy10_x3(pel10_t *p_dst[], int i_dst[], pel10_t *p_src[], int i_src[], int i_width[], int i_height[], int i_planes)
+{
+    pel10_t *dst, *src;
+    int y, k;
+
+    for (k = 0; k < i_planes; k++) {
+        int i_size = i_width[k] * sizeof(pel10_t);
         memcpy_t f_memcpy = i_size & 15 ? memcpy : g_funcs.memcpy_aligned;
         dst = p_dst[k];
         src = p_src[k];
@@ -213,7 +231,19 @@ static void block_copy_x3(pel_t *p_dst[], int i_dst[], pel_t *p_src[], int i_src
 /* ---------------------------------------------------------------------------
  */
 static ALWAYS_INLINE void
-xavs2_copy_col1(pel_t *dst, pel_t *src, const int height, const int stride)
+xavs2_copy_col18(pel8_t *dst, pel8_t *src, const int height, const int stride)
+{
+    int i;
+    int k = 0;
+
+    for (i = height; i != 0; i--) {
+        dst[k] = src[k];
+        k += stride;
+    }
+}
+
+static ALWAYS_INLINE void
+xavs2_copy_col110(pel10_t *dst, pel10_t *src, const int height, const int stride)
 {
     int i;
     int k = 0;
@@ -228,13 +258,28 @@ xavs2_copy_col1(pel_t *dst, pel_t *src, const int height, const int stride)
  * cache CTU border
  */
 static INLINE
-void xavs2_cache_lcu_border(pel_t *p_dst, const pel_t *p_top,
-                            const pel_t *p_left, int i_left,
+void xavs2_cache_lcu_border8(pel8_t *p_dst, const pel8_t *p_top,
+                            const pel8_t *p_left, int i_left,
+                            int lcu_width, int lcu_height)
+{
+    int i;
+    /* top, top-right */
+    memcpy(p_dst, p_top, (2 * lcu_width + 1) * sizeof(pel8_t));
+    /* left */
+    for (i = 1; i <= lcu_height; i++) {
+        p_dst[-i] = p_left[0];
+        p_left += i_left;
+    }
+}
+
+static INLINE
+void xavs2_cache_lcu_border10(pel10_t *p_dst, const pel10_t *p_top,
+                            const pel10_t *p_left, int i_left,
                             int lcu_width, int lcu_height)
 {
     int i;
     /* top, top-right */
-    memcpy(p_dst, p_top, (2 * lcu_width + 1) * sizeof(pel_t));
+    memcpy(p_dst, p_top, (2 * lcu_width + 1) * sizeof(pel10_t));
     /* left */
     for (i = 1; i <= lcu_height; i++) {
         p_dst[-i] = p_left[0];
@@ -246,14 +291,32 @@ void xavs2_cache_lcu_border(pel_t *p_dst, const pel_t *p_top,
  * cache CTU border (UV components together)
  */
 static INLINE
-void xavs2_cache_lcu_border_uv(pel_t *p_dst_u, const pel_t *p_top_u, const pel_t *p_left_u,
-                               pel_t *p_dst_v, const pel_t *p_top_v, const pel_t *p_left_v,
+void xavs2_cache_lcu_border8_uv(pel8_t *p_dst_u, const pel8_t *p_top_u, const pel8_t *p_left_u,
+                               pel8_t *p_dst_v, const pel8_t *p_top_v, const pel8_t *p_left_v,
+                               int i_left, int lcu_width, int lcu_height)
+{
+    const size_t top_bytes = (2 * lcu_width + 1) * sizeof(pel8_t);
+    int row;
+    /* top runs: 2 * lcu_width + 1 samples for each of the two planes */
+    memcpy(p_dst_u, p_top_u, top_bytes);
+    memcpy(p_dst_v, p_top_v, top_bytes);
+    /* left columns: source row r of each plane lands at index -(r + 1) */
+    for (row = 0; row < lcu_height; row++) {
+        const int src_off = row * i_left;
+        p_dst_u[-(row + 1)] = p_left_u[src_off];
+        p_dst_v[-(row + 1)] = p_left_v[src_off];
+    }
+}
+
+static INLINE
+void xavs2_cache_lcu_border10_uv(pel10_t *p_dst_u, const pel10_t *p_top_u, const pel10_t *p_left_u,
+                               pel10_t *p_dst_v, const pel10_t *p_top_v, const pel10_t *p_left_v,
                                int i_left, int lcu_width, int lcu_height)
 {
     int i;
     /* top, top-right */
-    memcpy(p_dst_u, p_top_u, (2 * lcu_width + 1) * sizeof(pel_t));
-    memcpy(p_dst_v, p_top_v, (2 * lcu_width + 1) * sizeof(pel_t));
+    memcpy(p_dst_u, p_top_u, (2 * lcu_width + 1) * sizeof(pel10_t));
+    memcpy(p_dst_v, p_top_v, (2 * lcu_width + 1) * sizeof(pel10_t));
     /* left */
     for (i = 1; i <= lcu_height; i++) {
         p_dst_u[-i] = p_left_u[0];
@@ -324,7 +387,7 @@ void lcu_start_init_pos(xavs2_t *h, int i_lcu_x, int i_lcu_y)
             p_cu_info->i_cu_qp     = (int8_t)(h->i_qp);   // needed in loop filter (even if constant QP is used)
 
             // reset syntax element entries in cu_info_t
-            // ÕâÐ©ÔªËØÔÚ±àÂëÃ¿¸öLCUÊ±»áÉèÖÃ£¬ËùÒÔ´Ë´¦²»ÐèÒªÐÞ¸Ä
+            // these elements are set when each LCU is encoded, so they need not be modified here
             // p_cu_info->i_mode  = PRED_SKIP;
             // p_cu_info->i_cbp   = 0;
             // p_cu_info->i_level = MIN_CU_SIZE_IN_BIT;
@@ -348,8 +411,9 @@ void lcu_start_init_pixels(xavs2_t *h, int i_lcu_x, int i_lcu_y)
     int blk_h[3];
     int i_src[3];
     int i_dst[3];
-    pel_t *p_src[3];
-    pel_t *p_dst[3];
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_src[3];
+    pel8_t *p_dst[3];
 
     /* -------------------------------------------------------------
      * 1, copy LCU pixel data from original image buffer
@@ -357,34 +421,73 @@ void lcu_start_init_pixels(xavs2_t *h, int i_lcu_x, int i_lcu_y)
     i_src[0] = h->fenc->i_stride[0];
     i_src[1] = h->fenc->i_stride[1];
     i_src[2] = h->fenc->i_stride[2];
-    p_src[0] = h->fenc->planes[0] + (img_y     ) * i_src[0] + (img_x     );
-    p_src[1] = h->fenc->planes[1] + (img_y >> 1) * i_src[1] + (img_x >> 1);
-    p_src[2] = h->fenc->planes[2] + (img_y >> 1) * i_src[2] + (img_x >> 1);
+    p_src[0] = h->fenc->planes8[0] + (img_y     ) * i_src[0] + (img_x     );
+    p_src[1] = h->fenc->planes8[1] + (img_y >> 1) * i_src[1] + (img_x >> 1);
+    p_src[2] = h->fenc->planes8[2] + (img_y >> 1) * i_src[2] + (img_x >> 1);
 
     i_dst[0] = i_dst[1] = i_dst[2] = FENC_STRIDE;
-    p_dst[0] = h->lcu.p_fenc[0];
-    p_dst[1] = h->lcu.p_fenc[1];
-    p_dst[2] = h->lcu.p_fenc[2];
+    p_dst[0] = h->lcu.p_fenc8[0];
+    p_dst[1] = h->lcu.p_fenc8[1];
+    p_dst[2] = h->lcu.p_fenc8[2];
 
     blk_w[0] = lcu_width;
     blk_h[0] = lcu_height;
     blk_w[1] = blk_w[2] = lcu_width  >> 1;
     blk_h[1] = blk_h[2] = lcu_height >> 1;
-    block_copy_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3);
+    block_copy8_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3);
 
     /* first CTU of LCU row */
     if (h->fenc->b_enable_intra || h->fenc->i_frm_type == XAVS2_TYPE_I) {
         if (img_x == 0) {
-            memcpy(h->lcu.ctu_border[0].rec_top + 1, h->intra_border[0], lcu_width * 2 * sizeof(pel_t));
-            memcpy(h->lcu.ctu_border[1].rec_top + 1, h->intra_border[1], lcu_width * sizeof(pel_t));
-            memcpy(h->lcu.ctu_border[2].rec_top + 1, h->intra_border[2], lcu_width * sizeof(pel_t));
+            memcpy(h->lcu.ctu_border8[0].rec_top + 1, h->intra_border8[0], lcu_width * 2 * sizeof(pel8_t));
+            memcpy(h->lcu.ctu_border8[1].rec_top + 1, h->intra_border8[1], lcu_width * sizeof(pel8_t));
+            memcpy(h->lcu.ctu_border8[2].rec_top + 1, h->intra_border8[2], lcu_width * sizeof(pel8_t));
         } else if (h->param->i_lcurow_threads > 1) {
             /* top-right pixels */
-            memcpy(h->lcu.ctu_border[0].rec_top + 1 + lcu_width,        h->intra_border[0] + img_x + lcu_width, lcu_width * sizeof(pel_t));
-            memcpy(h->lcu.ctu_border[1].rec_top + 1 + (lcu_width >> 1), h->intra_border[1] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel_t));
-            memcpy(h->lcu.ctu_border[2].rec_top + 1 + (lcu_width >> 1), h->intra_border[2] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel_t));
+            memcpy(h->lcu.ctu_border8[0].rec_top + 1 + lcu_width,        h->intra_border8[0] + img_x + lcu_width, lcu_width * sizeof(pel8_t));
+            memcpy(h->lcu.ctu_border8[1].rec_top + 1 + (lcu_width >> 1), h->intra_border8[1] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel8_t));
+            memcpy(h->lcu.ctu_border8[2].rec_top + 1 + (lcu_width >> 1), h->intra_border8[2] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel8_t));
         }
     }
+    } else {
+    pel10_t *p_src[3];
+    pel10_t *p_dst[3];
+
+    /* -------------------------------------------------------------
+     * 1, copy LCU pixel data from original image buffer
+     */
+    i_src[0] = h->fenc->i_stride[0];
+    i_src[1] = h->fenc->i_stride[1];
+    i_src[2] = h->fenc->i_stride[2];
+    p_src[0] = h->fenc->planes10[0] + (img_y     ) * i_src[0] + (img_x     );
+    p_src[1] = h->fenc->planes10[1] + (img_y >> 1) * i_src[1] + (img_x >> 1);
+    p_src[2] = h->fenc->planes10[2] + (img_y >> 1) * i_src[2] + (img_x >> 1);
+
+    i_dst[0] = i_dst[1] = i_dst[2] = FENC_STRIDE;
+    p_dst[0] = h->lcu.p_fenc10[0];
+    p_dst[1] = h->lcu.p_fenc10[1];
+    p_dst[2] = h->lcu.p_fenc10[2];
+
+    blk_w[0] = lcu_width;
+    blk_h[0] = lcu_height;
+    blk_w[1] = blk_w[2] = lcu_width  >> 1;
+    blk_h[1] = blk_h[2] = lcu_height >> 1;
+    block_copy10_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3);
+
+    /* first CTU of LCU row */
+    if (h->fenc->b_enable_intra || h->fenc->i_frm_type == XAVS2_TYPE_I) {
+        if (img_x == 0) {
+            memcpy(h->lcu.ctu_border10[0].rec_top + 1, h->intra_border10[0], lcu_width * 2 * sizeof(pel10_t));
+            memcpy(h->lcu.ctu_border10[1].rec_top + 1, h->intra_border10[1], lcu_width * sizeof(pel10_t));
+            memcpy(h->lcu.ctu_border10[2].rec_top + 1, h->intra_border10[2], lcu_width * sizeof(pel10_t));
+        } else if (h->param->i_lcurow_threads > 1) {
+            /* top-right pixels */
+            memcpy(h->lcu.ctu_border10[0].rec_top + 1 + lcu_width,        h->intra_border10[0] + img_x + lcu_width, lcu_width * sizeof(pel10_t));
+            memcpy(h->lcu.ctu_border10[1].rec_top + 1 + (lcu_width >> 1), h->intra_border10[1] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel10_t));
+            memcpy(h->lcu.ctu_border10[2].rec_top + 1 + (lcu_width >> 1), h->intra_border10[2] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel10_t));
+        }
+    }
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -404,8 +507,9 @@ void lcu_end(xavs2_t *h, int i_lcu_x, int i_lcu_y)
     int blk_h[3];
     int i_src[3];
     int i_dst[3];
-    pel_t *p_src[3];
-    pel_t *p_dst[3];
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_src[3];
+    pel8_t *p_dst[3];
 
     /* -------------------------------------------------------------
      * 1, copy decoded LCU to frame buffer
@@ -413,20 +517,20 @@ void lcu_end(xavs2_t *h, int i_lcu_x, int i_lcu_y)
     i_dst[0] = h->fdec->i_stride[0];
     i_dst[1] = h->fdec->i_stride[1];
     i_dst[2] = h->fdec->i_stride[2];
-    p_dst[0] = h->fdec->planes[0] + (img_y) * i_dst[0] + (img_x);
-    p_dst[1] = h->fdec->planes[1] + (img_y_c) * i_dst[1] + (img_x_c);
-    p_dst[2] = h->fdec->planes[2] + (img_y_c) * i_dst[2] + (img_x_c);
+    p_dst[0] = h->fdec->planes8[0] + (img_y) * i_dst[0] + (img_x);
+    p_dst[1] = h->fdec->planes8[1] + (img_y_c) * i_dst[1] + (img_x_c);
+    p_dst[2] = h->fdec->planes8[2] + (img_y_c) * i_dst[2] + (img_x_c);
 
     i_src[0] = i_src[1] = i_src[2] = FDEC_STRIDE;
-    p_src[0] = h->lcu.p_fdec[0];
-    p_src[1] = h->lcu.p_fdec[1];
-    p_src[2] = h->lcu.p_fdec[2];
+    p_src[0] = h->lcu.p_fdec8[0];
+    p_src[1] = h->lcu.p_fdec8[1];
+    p_src[2] = h->lcu.p_fdec8[2];
 
     blk_w[0] = lcu_width;
     blk_h[0] = lcu_height;
     blk_w[1] = blk_w[2] = lcu_width_c;
     blk_h[1] = blk_h[2] = lcu_height_c;
-    block_copy_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3);
+    block_copy8_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3);
 
     /* -------------------------------------------------------------
      * 2, backup right col and bottom row pixels for intra coding
@@ -440,18 +544,68 @@ void lcu_end(xavs2_t *h, int i_lcu_x, int i_lcu_y)
                i_pred_mode_width_in_lcu * sizeof(int8_t));
 
         /* cache top and left samples for intra prediction of next CTU */
-        xavs2_cache_lcu_border(h->lcu.ctu_border[0].rec_top, h->intra_border[0] + img_x + lcu_width - 1, p_src[0] + lcu_width - 1,
+        xavs2_cache_lcu_border8(h->lcu.ctu_border8[0].rec_top, h->intra_border8[0] + img_x + lcu_width - 1, p_src[0] + lcu_width - 1,
                                FDEC_STRIDE, lcu_width, lcu_height);
-        xavs2_cache_lcu_border_uv(h->lcu.ctu_border[1].rec_top, h->intra_border[1] + img_x_c + lcu_width_c - 1, p_src[1] + lcu_width_c - 1,
-                                  h->lcu.ctu_border[2].rec_top, h->intra_border[2] + img_x_c + lcu_width_c - 1, p_src[2] + lcu_width_c - 1,
+        xavs2_cache_lcu_border8_uv(h->lcu.ctu_border8[1].rec_top, h->intra_border8[1] + img_x_c + lcu_width_c - 1, p_src[1] + lcu_width_c - 1,
+                                  h->lcu.ctu_border8[2].rec_top, h->intra_border8[2] + img_x_c + lcu_width_c - 1, p_src[2] + lcu_width_c - 1,
                                   FDEC_STRIDE, lcu_width_c, lcu_height_c);
 
         /* 2.2, backup bottom row pixels */
         if (i_lcu_y < h->i_height_in_lcu - 1) {
-            g_funcs.fast_memcpy(h->intra_border[0] + img_x,   p_src[0] + (lcu_height   - 1) * FDEC_STRIDE, lcu_width   * sizeof(pel_t));
-            g_funcs.fast_memcpy(h->intra_border[1] + img_x_c, p_src[1] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel_t));
-            g_funcs.fast_memcpy(h->intra_border[2] + img_x_c, p_src[2] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel_t));
+            g_funcs.fast_memcpy(h->intra_border8[0] + img_x,   p_src[0] + (lcu_height   - 1) * FDEC_STRIDE, lcu_width   * sizeof(pel8_t));
+            g_funcs.fast_memcpy(h->intra_border8[1] + img_x_c, p_src[1] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel8_t));
+            g_funcs.fast_memcpy(h->intra_border8[2] + img_x_c, p_src[2] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel8_t));
         }
     }
+    } else {
+    pel10_t *p_src[3];
+    pel10_t *p_dst[3];
+
+    /* -------------------------------------------------------------
+     * 1, copy decoded LCU to frame buffer
+     */
+    i_dst[0] = h->fdec->i_stride[0];
+    i_dst[1] = h->fdec->i_stride[1];
+    i_dst[2] = h->fdec->i_stride[2];
+    p_dst[0] = h->fdec->planes10[0] + (img_y) * i_dst[0] + (img_x);
+    p_dst[1] = h->fdec->planes10[1] + (img_y_c) * i_dst[1] + (img_x_c);
+    p_dst[2] = h->fdec->planes10[2] + (img_y_c) * i_dst[2] + (img_x_c);
 
+    i_src[0] = i_src[1] = i_src[2] = FDEC_STRIDE;
+    p_src[0] = h->lcu.p_fdec10[0];
+    p_src[1] = h->lcu.p_fdec10[1];
+    p_src[2] = h->lcu.p_fdec10[2];
+
+    blk_w[0] = lcu_width;
+    blk_h[0] = lcu_height;
+    blk_w[1] = blk_w[2] = lcu_width_c;
+    blk_h[1] = blk_h[2] = lcu_height_c;
+    block_copy10_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3);
+
+    /* -------------------------------------------------------------
+     * 2, backup right col and bottom row pixels for intra coding
+     */
+    if (h->fenc->b_enable_intra || h->fenc->i_frm_type == XAVS2_TYPE_I) {
+        // backup intra pred mode of bottom 4x4 row
+        int i_pred_mode_stride = h->i_width_in_minpu + 16;
+        int i_pred_mode_width_in_lcu = (1 << h->i_lcu_level) >> MIN_PU_SIZE_IN_BIT;
+        memcpy(h->ipredmode - i_pred_mode_stride + i_lcu_x * i_pred_mode_width_in_lcu,
+               h->ipredmode + i_pred_mode_stride * (i_pred_mode_width_in_lcu - 1) + i_lcu_x * i_pred_mode_width_in_lcu,
+               i_pred_mode_width_in_lcu * sizeof(int8_t));
+
+        /* cache top and left samples for intra prediction of next CTU */
+        xavs2_cache_lcu_border10(h->lcu.ctu_border10[0].rec_top, h->intra_border10[0] + img_x + lcu_width - 1, p_src[0] + lcu_width - 1,
+                               FDEC_STRIDE, lcu_width, lcu_height);
+        xavs2_cache_lcu_border10_uv(h->lcu.ctu_border10[1].rec_top, h->intra_border10[1] + img_x_c + lcu_width_c - 1, p_src[1] + lcu_width_c - 1,
+                                  h->lcu.ctu_border10[2].rec_top, h->intra_border10[2] + img_x_c + lcu_width_c - 1, p_src[2] + lcu_width_c - 1,
+                                  FDEC_STRIDE, lcu_width_c, lcu_height_c);
+
+        /* 2.2, backup bottom row pixels */
+        if (i_lcu_y < h->i_height_in_lcu - 1) {
+            g_funcs.fast_memcpy(h->intra_border10[0] + img_x,   p_src[0] + (lcu_height   - 1) * FDEC_STRIDE, lcu_width   * sizeof(pel10_t));
+            g_funcs.fast_memcpy(h->intra_border10[1] + img_x_c, p_src[1] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel10_t));
+            g_funcs.fast_memcpy(h->intra_border10[2] + img_x_c, p_src[2] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel10_t));
+        }
+    }
+    }
 }
diff --git a/source/common/cudata.h b/source/common/cudata.h
index c867091..de8edf3 100644
--- a/source/common/cudata.h
+++ b/source/common/cudata.h
@@ -49,7 +49,7 @@ void lcu_end(xavs2_t *h, int i_lcu_x, int i_lcu_y);
  */
 static ALWAYS_INLINE int clip_qp(xavs2_t *h, int i_qp)
 {
-    /* AVS2-P2£º Í¼ÏñÁ¿»¯Òò×Ó  picture_qp */
+    /* AVS2-P2: picture quantization factor, picture_qp */
     int max_qp = MAX_QP + (h->param->sample_bit_depth - 8) * 8;
     return XAVS2_MAX(MIN_QP, XAVS2_MIN(max_qp, i_qp));
 }
@@ -117,10 +117,21 @@ int cu_get_slice_index(xavs2_t *h, int scu_x, int scu_y)
  */
 static ALWAYS_INLINE int cu_get_chroma_qp(xavs2_t *h, int luma_qp, int uv)
 {
-    int QP;
-    UNUSED_PARAMETER(uv);
-    UNUSED_PARAMETER(h);
-    QP = tab_qp_scale_chroma[XAVS2_CLIP3(0, 63, luma_qp)];
+    /* Chroma QP = luma QP plus the per-component delta (uv == 0 selects the
+     * U delta, any other value the V delta).  The bit-depth offset (8 per
+     * bit above 8) is removed before the table lookup and added back after
+     * it.  tab_qp_scale_chroma is applied exactly once and only with an
+     * index in 0..63 (the range the previous code clipped to); its result
+     * is a QP, not another table index. */
+    const int bit_depth_offset = ((h->param->sample_bit_depth - 8) << 3);
+    int QP = luma_qp + (uv == 0 ? h->param->chroma_quant_param_delta_u : h->param->chroma_quant_param_delta_v);
+
+    QP -= bit_depth_offset;
+    if (QP >= 0) {
+        /* negative values pass through unmapped; cap the index at 63 */
+        QP = tab_qp_scale_chroma[XAVS2_MIN(QP, 63)];
+    }
+    QP = XAVS2_CLIP3(0, 63 + bit_depth_offset, QP + bit_depth_offset);
     return QP;
 }
 
diff --git a/source/common/defines.h b/source/common/defines.h
index 575f0ce..2815930 100644
--- a/source/common/defines.h
+++ b/source/common/defines.h
@@ -57,7 +57,7 @@
  * ===========================================================================
  */
 
-/* ¼ì²éËã·¨ÊÇ·ñ¿ªÆô */
+/* check whether an algorithm is enabled */
 #define IS_ALG_ENABLE(alg)  ((h->i_fast_algs >> alg) & 1)
 
 /* ---------------------------------------------------------------------------
@@ -65,66 +65,66 @@
  */
 enum xavs2_fast_algorithms_e {
     /* fast inter */
-    OPT_EARLY_SKIP           ,        /* »ùÓÚÊ±¿ÕÏà¹ØÐÔµÄ¿ìËÙSKIP¾ö²ß */
-    OPT_PSC_MD               ,        /* »ùÓÚÊ±¿ÕÏà¹ØÐÔµÄ¿ìËÙÄ£Ê½¾ö²ß (prediction size correlation based mode decision) */
-    OPT_FAST_CBF_MODE        ,        /* »ùÓÚ×îÓÅ»®·ÖÄ£Ê½µÄCBF¿ìËÙÌø¹ýÊ£ÓàµÄ»®·ÖÄ£Ê½ */
-    OPT_FAST_PU_SEL          ,        /* OPT_FAST_CBF_MODEµÄ¼ò»¯Ëã·¨£¬cbf=0Ê±£¬Èô2Nx2N²»ÓÅÓÚSKIP£¬ÔòÌø¹ýÊ£ÓàÖ¡¼äÄ£Ê½ºÍÖ¡ÄÚÄ£Ê½ */
-    OPT_BYPASS_AMP           ,        /* Èç¹ûPRED_2NxNÎ´»ñµÃ×îÓÅ£¬Ö±½ÓÌø¹ýÏàÍ¬»®·Ö·½ÏòµÄPRED_2NxnU/PRED_2NxnD; PRED_Nx2NÍ¬Àí */
-    OPT_DMH_CANDIDATE        ,        /* ÓÃÓÚ¾«¼òDMHÄ£Ê½ÏÂµÄRDO´ÎÊý */
-    OPT_BYPASS_MODE_FPIC     ,        /* FÖ¡ÖÐµÄÖ¡ÄÚÄ£Ê½ÓëDMHÄ£Ê½Ìø¹ý */
-    OPT_ADVANCE_CHROMA_AEC   ,        /* ÌáÇ°É«¶È¿éµÄ±ä»»ÏµÊý±àÂë¹ý³Ì */
+    OPT_EARLY_SKIP           ,        /* fast SKIP decision based on spatio-temporal correlation */
+    OPT_PSC_MD               ,        /* fast mode decision based on spatio-temporal correlation (prediction size correlation based mode decision) */
+    OPT_FAST_CBF_MODE        ,        /* quickly skip the remaining partition modes based on the CBF of the best partition mode */
+    OPT_FAST_PU_SEL          ,        /* simplified OPT_FAST_CBF_MODE: when cbf=0 and 2Nx2N is not better than SKIP, skip the remaining inter and intra modes */
+    OPT_BYPASS_AMP           ,        /* if PRED_2NxN is not the best, skip PRED_2NxnU/PRED_2NxnD of the same partition direction; likewise for PRED_Nx2N */
+    OPT_DMH_CANDIDATE        ,        /* reduce the number of RDO passes in DMH mode */
+    OPT_BYPASS_MODE_FPIC     ,        /* skip intra modes and DMH modes in F frames */
+    OPT_ADVANCE_CHROMA_AEC   ,        /* move the transform-coefficient coding of chroma blocks earlier */
     OPT_ROUGH_MODE_SKIP      ,        /* */
-    OPT_CMS_ETMD             ,        /* Ìõ¼þÌø¹ýÖ¡ÄÚ»®·Ö·½Ê½£º
-                                       * £¨1£©ÈôI_2Nx2N²»ÓÅÓÚÖ¡¼äÔ¤²âÄ£Ê½£¬Ôò²»±éÀúÖ¡ÄÚÆäËû»®·Ö£»
-                                       * £¨2£©Ö¡¼ä×îÓÅÄ£Ê½µÄCBPÎªÁãÊ±Ìø¹ýÖ¡ÄÚ»®·Ö·½Ê½¡£*/
-    OPT_ROUGH_PU_SEL         ,        /* ´ÖÂÔµÄPU»®·ÖÄ£Ê½ËÑË÷ */
-    OPT_CBP_DIRECT           ,        /* ¸ù¾ÝdirectÄ£Ê½ÏÂ²Ð²îÊÇ·ñÎªÈ«Áã¿é£¬Ìø¹ýPU»®·ÖºÍCUµÝ¹é»®·Ö */
-    OPT_SKIP_DMH_THRES       ,        /* Í¨¹ýDistortionµÄãÐÖµ¾ö¶¨Ìø¹ýDMHÄ£Ê½µÄ±éÀú */
-    OPT_ROUGH_SKIP_SEL       ,        /* Í¨¹ýdistortion¶Ô±ÈÖ»¶Ô¸ö±ðskip/directÄ£Ê½×öRDO */
+    OPT_CMS_ETMD             ,        /* conditionally skip intra partition types:
+                                       * (1) if I_2Nx2N is not better than the inter prediction modes, do not try the other intra partitions;
+                                       * (2) skip the intra partition types when the CBP of the best inter mode is zero. */
+    OPT_ROUGH_PU_SEL         ,        /* rough search of PU partition modes */
+    OPT_CBP_DIRECT           ,        /* depending on whether the residual in direct mode is an all-zero block, skip PU partitioning and recursive CU splitting */
+    OPT_SKIP_DMH_THRES       ,        /* use a distortion threshold to decide whether to skip the DMH mode traversal */
+    OPT_ROUGH_SKIP_SEL       ,        /* compare distortion so that RDO is run only on selected skip/direct modes */
 
     /* fast intra */
-    OPT_BYPASS_SDIP          ,        /* Èç¹ûPRED_I_2NxnÒÑ»ñ×îÓÅ£¬Ö±½ÓÌø¹ýPRED_I_nx2N */
-    OPT_FAST_INTRA_MODE      ,        /* Ö¡ÄÚÄ£Ê½¿ìËÙ¾ö²ß */
-    OPT_FAST_RDO_INTRA_C     ,        /* ¿ìËÙÖ¡ÄÚChromaÔ¤²âÄ£Ê½ÓÅ»¯£¬¼õÉÙÉ«¶È·ÖÁ¿¾ö²ßÊýÁ¿ */
-    OPT_ET_RDO_INTRA_L       ,        /* Luma RDO¹ý³ÌÌáÇ°ÍË³ö²ßÂÔ */
-    OPT_ET_INTRA_DEPTH       ,        /* »ùÓÚMADÖµµÄIÖ¡depth»®·ÖÌáÇ°ÖÕÖ¹ */
-    OPT_BYPASS_INTRA_BPIC    ,        /* BÖ¡ÖÐÈôÖ¡¼äÔ¤²âÄ£Ê½µÄCBPÎªÁã£¬ÔòÌø¹ýÖ¡ÄÚÔ¤²âÄ£Ê½¾ö²ß */
-    OPT_FAST_INTRA_IN_INTER  ,        /* ÒÀ¾Ý×ÓCUµÄ×îÓÅÄ£Ê½ÊÇ·ñÖ¡ÄÚ¼°µ±Ç°CUµÄÖ¡¼äÄ£Ê½RDCost½ûÓÃÖ¡¼äµÄÖ¡ÄÚÄ£Ê½ */
+    OPT_BYPASS_SDIP          ,        /* if PRED_I_2Nxn is already the best, skip PRED_I_nx2N */
+    OPT_FAST_INTRA_MODE      ,        /* fast intra mode decision */
+    OPT_FAST_RDO_INTRA_C     ,        /* fast intra chroma prediction mode optimization: fewer chroma candidates to decide on */
+    OPT_ET_RDO_INTRA_L       ,        /* early-exit strategy for the luma RDO process */
+    OPT_ET_INTRA_DEPTH       ,        /* early termination of I-frame depth splitting based on the MAD value */
+    OPT_BYPASS_INTRA_BPIC    ,        /* in B frames, skip the intra prediction mode decision if the CBP of the inter prediction mode is zero */
+    OPT_FAST_INTRA_IN_INTER  ,        /* disable intra modes in inter frames depending on whether the best mode of the sub-CUs is intra and on the inter-mode RDCost of the current CU */
 
     /* fast CU depth */
-    OPT_ECU                  ,        /* HMÖÐÈ«ÁãSKIPÄ£Ê½ÖÕÖ¹ÏÂ²ã»®·Ö */
+    OPT_ECU                  ,        /* as in HM: an all-zero SKIP mode terminates further splitting */
     OPT_ET_HOMO_MV           ,        /* */
     OPT_CU_CSET              ,        /* CSET of uAVS2, Only for inter frames that are not referenced by others */
-    OPT_CU_DEPTH_CTRL        ,        /* »ùÓÚÊ±¿ÕÏà¹ØÐÔµÄDepth¹À¼Æ£¬ÒÀ¾ÝÉÏ¡¢×ó¡¢×óÉÏ¡¢ÓÒÉÏºÍÊ±Óò²Î¿¼¿élevelµ÷ÕûDEPTH·¶Î§£¬È«IÖ¡Ò²ÊÊÓÃ */
+    OPT_CU_DEPTH_CTRL        ,        /* depth estimation based on spatio-temporal correlation: adjust the DEPTH range from the levels of the above, left, above-left, above-right and temporal reference blocks; also applies to all-I coding */
     OPT_CU_QSFD              ,        /* CU splitting termination based on RD-Cost:
                                          Z. Wang, R. Wang, K. Fan, H. Sun, and W. Gao,
-                                         ¡°uAVS2¡ªFast encoder for the 2nd generation IEEE 1857 video coding standard,¡±
-                                         Signal Process. Image Commun., vol. 53, no. October 2016, pp. 13¨C23, 2017. */
+                                         "uAVS2 - Fast encoder for the 2nd generation IEEE 1857 video coding standard,"
+                                         Signal Process. Image Commun., vol. 53, no. October 2016, pp. 13-23, 2017. */
 
     /* fast transform and Quant */
-    OPT_BYPASS_INTRA_RDOQ    ,        /* Ìø¹ýBÖ¡Ö¡¼ä±àÂëÖÐµÄÖ¡ÄÚÄ£Ê½µÄRDOQ */
-    OPT_RDOQ_AZPC            ,        /* Í¨¹ý¶Ô±ä»»ÏµÊýµÄãÐÖµÅÐ¶Ï¼ì²âÈ«Áã¿é½øÐÐRDOQÔ¤´¦Àí£¬Ìø¹ýÉ«¶È·ÖÁ¿µÄRDOQ¹ý³Ì*/
+    OPT_BYPASS_INTRA_RDOQ    ,        /* skip RDOQ for intra modes in the inter coding of B frames */
+    OPT_RDOQ_AZPC            ,        /* RDOQ pre-processing: detect all-zero blocks by thresholding the transform coefficients; skip the RDOQ process for chroma components */
 
     /* others */
-    OPT_FAST_ZBLOCK          ,        /* ¿ìËÙÁã¿é¹À¼Æ */
-    OPT_TR_KEY_FRAME_MD      ,        /* ÒÔ¸ü´ó¸ÅÂÊÌø¹ý·Ç¹Ø¼üÖ¡µÄ²¿·ÖÄ£Ê½£¬ÄÜ½ÚÊ¡5%ÒÔÉÏÊ±¼ä */
-    OPT_CODE_OPTIMZATION     ,        /* OPT_CU_SUBCU_COST: ÏÈ±àÂë´óCU£¬ÔÙ±àÂëÐ¡CUÊ±ÈôÇ°¼¸¸öÐ¡CUµÄRDCost³¬¹ý´óCUµÄÒ»¶¨±ÈÂÊÔòÌø¹ýºóÐøCU
-                                       * OPT_RDOQ_SKIP:     Í¨¹ýÔÚRDOQÖ®Ç°¶Ô±ä»»ÏµÊýµÄãÐÖµÅÐ¶Ï¼ì²âÈ«Áã¿é£¬Ìø¹ýRDOQ¹ý³Ì
+    OPT_FAST_ZBLOCK          ,        /* fast zero-block estimation */
+    OPT_TR_KEY_FRAME_MD      ,        /* skip some modes of non-key frames with higher probability; saves more than 5% of the time */
+    OPT_CODE_OPTIMZATION     ,        /* OPT_CU_SUBCU_COST: the large CU is coded first; while coding the small CUs, skip the remaining ones if the RDCost of the first few exceeds a certain ratio of the large CU's
+                                       * OPT_RDOQ_SKIP:     detect all-zero blocks by thresholding the transform coefficients before RDOQ and skip the RDOQ process
                                        */
-    OPT_BIT_EST_PSZT         ,        /* ¿ìËÙTU±ÈÌØ¹À¼Æ£º¶Ô33x32µÄÁÁ¶ÈTU¼Ù¶¨Ö»ÓÐµÍÆµµÄ16x16²¿·ÖÓÐ·ÇÁãÏµÊý */
-    OPT_TU_LEVEL_DEC         ,        /* TUÁ½²ã»®·Ö¾ö²ß£º¶ÔµÚÒ»²ãTU»®·ÖÑ¡³ö×îÓÅ£¬¶Ô×îÓÅ×öµÚ¶þ²ãTU»®·Ö£¬¾ö²ßÊÇ·ñÐèÒªÁ½²ãTU»®·Ö */
-    OPT_FAST_ALF             ,        /* ALF¿ìËÙËã·¨£¬ÔÚ¶¥²ãBÖ¡£¨²»±»ÆäÓàÖ¡²Î¿¼£©½ûÓÃALF£¬ÔÚËùÓÐALFµÄÐ­·½²î¾ØÕó¼ÆËãÊ±£¬½øÐÐstep=2µÄÏÂ²ÉÑù */
-    OPT_FAST_SAO             ,        /* SAO¿ìËÙËã·¨£¬ÔÚ¶¥²ãBÖ¡£¨²»±»ÆäÓàÖ¡²Î¿¼£©½ûÓÃSAO */
-    OPT_SUBCU_SPLIT          ,        /* ¸ù¾Ý»®·Ö×Ó¿éµÄÊýÄ¿¾ö²ß¸¸¿éÊÇ·ñ¶Ô·ÇSKIPÄ£Ê½×öRDO */
-    OPT_PU_RMS               ,        /* ¹Ø±ÕÐ¡¿é£¨8x8,16x16)»®·ÖµÄÔ¤²âµ¥Ôª£¬½ö±£Áô2Nx2NµÄÖ¡ÄÚ£¬Ö¡¼äÒÔ¼°SKIPÄ£Ê½*/
-    NUM_FAST_ALGS                     /* ×ÜµÄ¿ìËÙËã·¨ÊýÁ¿ */
+    OPT_BIT_EST_PSZT         ,        /* fast TU bit estimation: for a 33x32 [sic] luma TU assume only the low-frequency 16x16 part has non-zero coefficients */
+    OPT_TU_LEVEL_DEC         ,        /* two-level TU split decision: pick the best first-level TU split, apply a second-level split to it, then decide whether the two-level split is needed */
+    OPT_FAST_ALF             ,        /* fast ALF: disable ALF on top-layer B frames (not referenced by other frames); subsample with step=2 when computing all ALF covariance matrices */
+    OPT_FAST_SAO             ,        /* fast SAO: disable SAO on top-layer B frames (not referenced by other frames) */
+    OPT_SUBCU_SPLIT          ,        /* decide from the number of split sub-blocks whether the parent block runs RDO on non-SKIP modes */
+    OPT_PU_RMS               ,        /* turn off the prediction units of small-block (8x8, 16x16) partitions, keeping only 2Nx2N intra, inter and SKIP modes */
+    NUM_FAST_ALGS                     /* total number of fast algorithms */
 };
 
 
 /* ---------------------------------------------------------------------------
  * const defines related with fast algorithms
  */
-#define SAVE_CU_INFO            1     /* ±£´æ²Î¿¼Ö¡¶ÓÁÐÀïµÄÃ¿Ò»Ö¡µÄcu typeºÍcu bitsize£¬ÓÃÓÚ»ñÈ¡Ê±ÓòµÄcuÄ£Ê½ºÍcu³ß´ç */
+#define SAVE_CU_INFO            1     /* save the cu type and cu bitsize of every frame in the reference queue; used to obtain the temporal cu mode and cu size */
 #define NUM_INTRA_C_FULL_RD     4
 
 /* ---------------------------------------------------------------------------
@@ -144,7 +144,7 @@ enum xavs2_fast_algorithms_e {
  */
 #define ENABLE_RATE_CONTROL_CU  0     /* Enable Rate-Control on CU level: 1: enable, 0: disable */
 
-#define ENABLE_AUTO_INIT_QP     1     /* ¸ù¾ÝÄ¿±êÂëÂÊ×Ô¶¯ÉèÖÃ³õÊ¼QPÖµ */
+#define ENABLE_AUTO_INIT_QP     1     /* set the initial QP automatically from the target bitrate */
 
 
 /**
@@ -224,16 +224,16 @@ enum xavs2_fast_algorithms_e {
 #define LAM_2Level_TU           0.8
 #define DMH_MODE_NUM            5     /* number of DMH mode */
 #define WPM_NUM                 3     /* number of WPM */
-#define TH_PMVR                 2     /* PMVRÖÐËÄ·ÖÖ®Ò»ÏñËØ¾«¶ÈMVµÄ¿ÉÓÃ·¶Î§ */
+#define TH_PMVR                 2     /* usable range of quarter-pel precision MVs in PMVR */
 
 
 /* ---------------------------------------------------------------------------
  * coefficient coding
  */
-#define MAX_TU_SIZE             32    /* ×î´ó±ä»»¿é´óÐ¡£¬ìØ±àÂëÊ±µÄÏµÊý¾ØÕó */
-#define MAX_TU_SIZE_IN_BIT      5     /* ×î´ó±ä»»¿é´óÐ¡£¬ìØ±àÂëÊ±µÄÏµÊý¾ØÕó */
-#define SIZE_CG                 4     /* CG ´óÐ¡ 4x4 */
-#define SIZE_CG_IN_BIT          2     /* CG ´óÐ¡ 4x4 */
+#define MAX_TU_SIZE             32    /* maximum transform block size; coefficient matrix used in entropy coding */
+#define MAX_TU_SIZE_IN_BIT      5     /* maximum transform block size (log2); coefficient matrix used in entropy coding */
+#define SIZE_CG                 4     /* CG size: 4x4 */
+#define SIZE_CG_IN_BIT          2     /* CG size 4x4 (log2) */
 #define MAX_CG_NUM_IN_TU        (1 << ((MAX_TU_SIZE_IN_BIT - SIZE_CG_IN_BIT) << 1))
 
 /* ---------------------------------------------------------------------------
@@ -247,14 +247,14 @@ enum xavs2_fast_algorithms_e {
 /* ---------------------------------------------------------------------------
  * SAO (Sample Adaptive Offset)
  */
-#define NUM_BO_OFFSET                 32                            /*BOÄ£Ê½ÏÂoffsetÊýÁ¿£¬ÆäÖÐ×î¶à4¸ö·ÇÁã*/
-#define MAX_NUM_SAO_CLASSES           32                            /*×î´óoffsetÊýÁ¿*/
+#define NUM_BO_OFFSET                 32                            /* number of offsets in BO mode, of which at most 4 are non-zero */
+#define MAX_NUM_SAO_CLASSES           32                            /* maximum number of offsets */
 #define NUM_SAO_BO_CLASSES_LOG2       5                             /**/
 #define NUM_SAO_BO_CLASSES_IN_BIT     5                             /**/
-#define NUM_SAO_BO_CLASSES           (1 << NUM_SAO_BO_CLASSES_LOG2) /*BOÄ£Ê½ÏÂstartbandÊýÄ¿*/
-#define SAO_RATE_THR                  1.0                          /*ÁÁ¶È·ÖÁ¿£¬ÓÃÓÚRDO¾ö²ß*/
-#define SAO_RATE_CHROMA_THR           1.0                          /*É«¶È·ÖÁ¿£¬ÓÃÓÚRDO¾ö²ß*/
-#define SAO_SHIFT_PIX_NUM             4                             /*SAOÏò×óÉÏÆ«ÒÆµÄÏñËØµãÊý*/
+#define NUM_SAO_BO_CLASSES           (1 << NUM_SAO_BO_CLASSES_LOG2) /* number of start bands in BO mode */
+#define SAO_RATE_THR                  1.0                          /* luma component, used in the RDO decision */
+#define SAO_RATE_CHROMA_THR           1.0                          /* chroma components, used in the RDO decision */
+#define SAO_SHIFT_PIX_NUM             4                             /* number of pixels by which SAO is shifted toward the top-left */
 
 
 #define MAX_DOUBLE              1.7e+308
@@ -302,7 +302,7 @@ enum xavs2_fast_algorithms_e {
 #define MAX_SLICES                8   /* max number of slices in one picture */
 #define MAX_PARALLEL_FRAMES       8   /* max number of parallel encoding frames */
 #define MAX_COI_VALUE   ((1<<8) - 1)  /* max COI value (unsigned char) */
-#define PIXEL_MAX ((1<<BIT_DEPTH)-1)  /* max value of a pixel */
+//#define PIXEL_MAX ((1<<8)-1)  /* max value of a pixel */
 
 
 /* ---------------------------------------------------------------------------
diff --git a/source/common/filter_alf.c b/source/common/filter_alf.c
index 203a9c0..626b007 100644
--- a/source/common/filter_alf.c
+++ b/source/common/filter_alf.c
@@ -49,23 +49,30 @@
 /* ---------------------------------------------------------------------------
  */
 static
-void alf_filter_block1(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
+void alf_filter8_block1(xavs2_t *h, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src,
                        int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
                        int *alf_coeff, int b_top_avail, int b_down_avail)
 {
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
     const int pel_add  = 1 << (ALF_NUM_BIT_SHIFT - 1);
+    const int pel_max  = (1 << h->param->input_sample_bit_depth) - 1;
+
     int startPos = b_top_avail  ? (lcu_pix_y - 4) : lcu_pix_y;
     int endPos   = b_down_avail ? (lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height);
-    int xPosEnd  = lcu_pix_x + lcu_width;
-    int min_x    = lcu_pix_x - 3;
-    int max_x    = xPosEnd - 1 + 3;
+    int min_x, max_x, xPosEnd;
+    min_x    = -3;
+    max_x    = lcu_width - 1 + 3;
     int yUp, yBottom;
     int xLeft, xRight;
     int x, y, pel_val;
-    pel_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6;
+    pel8_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6;
 
-    p_src += (startPos * i_src);
-    p_dst += (startPos * i_dst);
+    lcu_height = endPos - startPos;
+    lcu_height--;
+
+    p_src += (startPos * i_src) + lcu_pix_x;
+    p_dst += (startPos * i_dst) + lcu_pix_x;
 
     for (y = startPos; y < endPos; y++) {
         yUp     = XAVS2_CLIP3(startPos, endPos - 1, y - 1);
@@ -104,22 +111,94 @@ void alf_filter_block1(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
             pel_val += alf_coeff[8] * (p_src [x     ]);
 
             pel_val   = (pel_val + pel_add) >> ALF_NUM_BIT_SHIFT;
-            p_dst[x] = (pel_t)XAVS2_CLIP1(pel_val);
+            p_dst[x] = (pel8_t)XAVS2_CLIP1(pel_val);
+        }
+        p_src += i_src;
+        p_dst += i_dst;
+    }
+#undef XAVS2_CLIP1
+}
+
+/* ALF filter for one LCU region of 10-bit samples (pel10_t twin of alf_filter8_block1).
+ * p_src/p_dst are advanced by whole rows only, so x and y run in absolute picture coordinates. */
+static
+void alf_filter10_block1(xavs2_t *h, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src,
+                       int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
+                       int *alf_coeff, int b_top_avail, int b_down_avail)
+{
+    const int pel_add  = 1 << (ALF_NUM_BIT_SHIFT - 1);
+    const int pel_max  = (1 << h->param->input_sample_bit_depth) - 1;
+
+    int startPos = b_top_avail  ? (lcu_pix_y - 4) : lcu_pix_y;
+    int endPos   = b_down_avail ? (lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height);
+    int xPosEnd  = lcu_pix_x + lcu_width;
+    int min_x    = lcu_pix_x - 3;
+    int max_x    = xPosEnd - 1 + 3;
+    int yUp, yBottom;
+    int xLeft, xRight;
+    int x, y, pel_val;
+    pel10_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6;
+
+    p_src += (startPos * i_src);
+    p_dst += (startPos * i_dst);
+
+    /* y must be absolute: the row taps below are clamped to [startPos, endPos - 1] */
+    for (y = startPos; y < endPos; y++) {
+        yUp     = XAVS2_CLIP3(startPos, endPos - 1, y - 1);
+        yBottom = XAVS2_CLIP3(startPos, endPos - 1, y + 1);
+        p_src1 = p_src + (yBottom - y) * i_src;
+        p_src2 = p_src + (yUp     - y) * i_src;
+
+        yUp     = XAVS2_CLIP3(startPos, endPos - 1, y - 2);
+        yBottom = XAVS2_CLIP3(startPos, endPos - 1, y + 2);
+        p_src3 = p_src + (yBottom - y) * i_src;
+        p_src4 = p_src + (yUp     - y) * i_src;
+
+        yUp     = XAVS2_CLIP3(startPos, endPos - 1, y - 3);
+        yBottom = XAVS2_CLIP3(startPos, endPos - 1, y + 3);
+        p_src5 = p_src + (yBottom - y) * i_src;
+        p_src6 = p_src + (yUp     - y) * i_src;
+
+        /* x must be absolute too: min_x/max_x are derived from lcu_pix_x and the row pointers carry no column offset */
+        for (x = lcu_pix_x; x < xPosEnd; x++) {
+            pel_val  = alf_coeff[0] * (p_src5[x] + p_src6[x]);
+            pel_val += alf_coeff[1] * (p_src3[x] + p_src4[x]);
+
+            xLeft    = XAVS2_CLIP3(min_x, max_x, x - 1);
+            xRight   = XAVS2_CLIP3(min_x, max_x, x + 1);
+            pel_val += alf_coeff[2] * (p_src1[xRight] + p_src2[xLeft ]);
+            pel_val += alf_coeff[3] * (p_src1[x     ] + p_src2[x     ]);
+            pel_val += alf_coeff[4] * (p_src1[xLeft ] + p_src2[xRight]);
+            pel_val += alf_coeff[7] * (p_src [xRight] + p_src [xLeft ]);
+
+            xLeft    = XAVS2_CLIP3(min_x, max_x, x - 2);
+            xRight   = XAVS2_CLIP3(min_x, max_x, x + 2);
+            pel_val += alf_coeff[6] * (p_src [xRight] + p_src [xLeft ]);
+
+            xLeft    = XAVS2_CLIP3(min_x, max_x, x - 3);
+            xRight   = XAVS2_CLIP3(min_x, max_x, x + 3);
+            pel_val += alf_coeff[5] * (p_src [xRight] + p_src [xLeft ]);
+            pel_val += alf_coeff[8] * (p_src [x     ]);
+
+            pel_val   = (pel_val + pel_add) >> ALF_NUM_BIT_SHIFT;
+            p_dst[x] = (pel10_t)XAVS2_CLIP3(0, pel_max, pel_val);
         }
 
         p_src += i_src;
         p_dst += i_dst;
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
 static
-void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
+void alf_filter8_block2(xavs2_t *h, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src,
                        int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
                        int *alf_coeff, int b_top_avail, int b_down_avail)
 {
-    pel_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6;
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    pel8_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6;
     int pixelInt;
     int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y;
     int endPos = b_down_avail ? (lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height);
@@ -147,7 +226,7 @@ void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
         pixelInt += alf_coeff[8] * (p_src [ 0]);
 
         pixelInt = (int)((pixelInt + 32) >> 6);
-        p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt);
+        p_dst[0] = (pel8_t)XAVS2_CLIP1(pixelInt);
     }
 
     p_src += lcu_width - 1;
@@ -172,7 +251,7 @@ void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
         pixelInt += alf_coeff[8] * (p_src [ 0]);
 
         pixelInt = (int)((pixelInt + 32) >> 6);
-        p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt);
+        p_dst[0] = (pel8_t)XAVS2_CLIP1(pixelInt);
     }
 
     /* last line */
@@ -200,7 +279,7 @@ void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
         pixelInt += alf_coeff[8] * (p_src [ 0]);
 
         pixelInt = (int)((pixelInt + 32) >> 6);
-        p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt);
+        p_dst[0] = (pel8_t)XAVS2_CLIP1(pixelInt);
     }
 
     p_src += lcu_width - 1;
@@ -225,22 +304,154 @@ void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
         pixelInt += alf_coeff[8] * (p_src [ 0]);
 
         pixelInt = (int)((pixelInt + 32) >> 6);
-        p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt);
+        p_dst[0] = (pel8_t)XAVS2_CLIP1(pixelInt);
     }
+#undef XAVS2_CLIP1
+}
+
+static
+void alf_filter10_block2(xavs2_t *h, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src,
+                       int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
+                       int *alf_coeff, int b_top_avail, int b_down_avail)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+    /* Filters only the four corner samples of the region; a corner is skipped when it equals its outer horizontal neighbour. */
+    pel10_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6;
+    int pixelInt;
+    int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y;
+    int endPos = b_down_avail ? (lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height);
+
+    /* first line, left corner: the rows above (p_src2/4/6) are replaced by the current row */
+    p_src += (startPos * i_src) + lcu_pix_x;
+    p_dst += (startPos * i_dst) + lcu_pix_x;
+
+    if (p_src[0] != p_src[-1]) {
+        p_src1 = p_src + 1 * i_src;
+        p_src2 = p_src;
+        p_src3 = p_src + 2 * i_src;
+        p_src4 = p_src;
+        p_src5 = p_src + 3 * i_src;
+        p_src6 = p_src;
+
+        pixelInt  = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]);
+        pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]);
+        pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[ 0]);
+        pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]);
+        pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 1]);
+        pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]);
+        pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]);
+        pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]);
+        pixelInt += alf_coeff[8] * (p_src [ 0]);
+
+        pixelInt = (int)((pixelInt + 32) >> 6);
+        p_dst[0] = (pel10_t)XAVS2_CLIP1(pixelInt);
+    }
+    /* first line, right corner: move to the last column of the region */
+    p_src += lcu_width - 1;
+    p_dst += lcu_width - 1;
+
+    if (p_src[0] != p_src[1]) {
+        p_src1 = p_src + 1 * i_src;
+        p_src2 = p_src;
+        p_src3 = p_src + 2 * i_src;
+        p_src4 = p_src;
+        p_src5 = p_src + 3 * i_src;
+        p_src6 = p_src;
+
+        pixelInt  = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]);
+        pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]);
+        pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[-1]);
+        pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]);
+        pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 0]);
+        pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]);
+        pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]);
+        pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]);
+        pixelInt += alf_coeff[8] * (p_src [ 0]);
+
+        pixelInt = (int)((pixelInt + 32) >> 6);
+        p_dst[0] = (pel10_t)XAVS2_CLIP1(pixelInt);
+    }
+
+    /* last line, left corner: step back to the first column, then down to row endPos - 1;
+     * the rows below (p_src1/3/5) are replaced by the current row */
+    p_src -= lcu_width - 1;
+    p_dst -= lcu_width - 1;
+    p_src += ((endPos - startPos - 1) * i_src);
+    p_dst += ((endPos - startPos - 1) * i_dst);
+
+    if (p_src[0] != p_src[-1]) {
+        p_src1 = p_src;
+        p_src2 = p_src - 1 * i_src;
+        p_src3 = p_src;
+        p_src4 = p_src - 2 * i_src;
+        p_src5 = p_src;
+        p_src6 = p_src - 3 * i_src;
+
+        pixelInt  = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]);
+        pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]);
+        pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[-1]);
+        pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]);
+        pixelInt += alf_coeff[4] * (p_src1[ 0] + p_src2[ 1]);
+        pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]);
+        pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]);
+        pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]);
+        pixelInt += alf_coeff[8] * (p_src [ 0]);
+
+        pixelInt = (int)((pixelInt + 32) >> 6);
+        p_dst[0] = (pel10_t)XAVS2_CLIP1(pixelInt);
+    }
+    /* last line, right corner */
+    p_src += lcu_width - 1;
+    p_dst += lcu_width - 1;
+
+    if (p_src[0] != p_src[1]) {
+        p_src1 = p_src;
+        p_src2 = p_src - 1 * i_src;
+        p_src3 = p_src;
+        p_src4 = p_src - 2 * i_src;
+        p_src5 = p_src;
+        p_src6 = p_src - 3 * i_src;
+
+        pixelInt  = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]);
+        pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]);
+        pixelInt += alf_coeff[2] * (p_src1[ 0] + p_src2[-1]);
+        pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]);
+        pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 1]);
+        pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]);
+        pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]);
+        pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]);
+        pixelInt += alf_coeff[8] * (p_src [ 0]);
+
+        pixelInt = (int)((pixelInt + 32) >> 6);
+        p_dst[0] = (pel10_t)XAVS2_CLIP1(pixelInt);
+    }
+#undef XAVS2_CLIP1
 }
 
 /* ---------------------------------------------------------------------------
  */
-void xavs2_alf_init(uint32_t cpuid, intrinsic_func_t *pf)
+void xavs2_alf_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf)
 {
+    if (param->input_sample_bit_depth == 8) {
     /* set function handles */
-    pf->alf_flt[0] = alf_filter_block1;
-    pf->alf_flt[1] = alf_filter_block2;
+    pf->alf_flt8[0] = alf_filter8_block1;
+    pf->alf_flt8[1] = alf_filter8_block2;
 #if HAVE_MMX
     if (cpuid & XAVS2_CPU_SSE42) {
-        pf->alf_flt[0] = alf_flt_one_block_sse128;
+        pf->alf_flt8[0] = alf_flt_one_block_sse128;
     }
 #else
     UNUSED_PARAMETER(cpuid);
 #endif
+    } else {
+    /* Any other bit depth uses the pel10_t handles, and only the portable
+     * C kernels are installed for them.  alf_flt_one_block_sse128 is the
+     * routine bound to the pel8_t table above; a single function cannot
+     * match both the pel8_t and the pel10_t handle signatures, so binding
+     * it here as well would run a kernel written for the other sample
+     * type on this buffer.
+     * NOTE(review): add a dedicated SIMD kernel before accelerating this path. */
+
+    pf->alf_flt10[0] = alf_filter10_block1;
+    pf->alf_flt10[1] = alf_filter10_block2;
+    }
 }
diff --git a/source/common/filter_deblock.c b/source/common/filter_deblock.c
index 55edc37..7434511 100644
--- a/source/common/filter_deblock.c
+++ b/source/common/filter_deblock.c
@@ -290,7 +290,7 @@ uint8_t lf_skip_filter(xavs2_t *h, cu_info_t *MbP, cu_info_t *MbQ, int dir, int
 
 /* ---------------------------------------------------------------------------
  */
-static void lf_edge_core(pel_t *src, int b_chroma, int ptr_inc, int inc1, int alpha, int beta, uint8_t *flt_flag)
+static void lf_edge_core8(pel8_t *src, int b_chroma, int ptr_inc, int inc1, int alpha, int beta, uint8_t *flt_flag)
 {
     int pel;
     int abs_delta;
@@ -347,26 +347,114 @@ static void lf_edge_core(pel_t *src, int b_chroma, int ptr_inc, int inc1, int al
 
             switch (fs) {
             case 4:
-                src[-inc1] = (pel_t)((L0 + ((L0 + L2) << 3) + L2 + (R0 << 3) + (R2 << 2) + (R2 << 1) + 16) >> 5);   // L0
-                src[-inc2] = (pel_t)(((L0 << 3) - L0 + (L2 << 2) + (L2 << 1) + R0 + (R0 << 1) + 8) >> 4);           // L1
-                src[-inc3] = (pel_t)(((L0 << 2) + L2 + (L2 << 1) + R0 + 4) >> 3);                                   // L2
-                src[    0] = (pel_t)((R0 + ((R0 + R2) << 3) + R2 + (L0 << 3) + (L2 << 2) + (L2 << 1) + 16) >> 5);   // R0
-                src[ inc1] = (pel_t)(((R0 << 3) - R0 + (R2 << 2) + (R2 << 1) + L0 + (L0 << 1) + 8) >> 4);           // R1
-                src[ inc2] = (pel_t)(((R0 << 2) + R2 + (R2 << 1) + L0 + 4) >> 3);                                   // R2
+                src[-inc1] = (pel8_t)((L0 + ((L0 + L2) << 3) + L2 + (R0 << 3) + (R2 << 2) + (R2 << 1) + 16) >> 5);   // L0
+                src[-inc2] = (pel8_t)(((L0 << 3) - L0 + (L2 << 2) + (L2 << 1) + R0 + (R0 << 1) + 8) >> 4);           // L1
+                src[-inc3] = (pel8_t)(((L0 << 2) + L2 + (L2 << 1) + R0 + 4) >> 3);                                   // L2
+                src[    0] = (pel8_t)((R0 + ((R0 + R2) << 3) + R2 + (L0 << 3) + (L2 << 2) + (L2 << 1) + 16) >> 5);   // R0
+                src[ inc1] = (pel8_t)(((R0 << 3) - R0 + (R2 << 2) + (R2 << 1) + L0 + (L0 << 1) + 8) >> 4);           // R1
+                src[ inc2] = (pel8_t)(((R0 << 2) + R2 + (R2 << 1) + L0 + 4) >> 3);                                   // R2
                 break;
             case 3:
-                src[-inc1] = (pel_t)((L2 + (L1 << 2) + (L0 << 2) + (L0 << 1) + (R0 << 2) + R1 + 8) >> 4);   // L0
-                src[    0] = (pel_t)((L1 + (L0 << 2) + (R0 << 2) + (R0 << 1) + (R1 << 2) + R2 + 8) >> 4);   // R0
-                src[-inc2] = (pel_t)((L2 * 3 + L1 * 8 + L0 * 4 + R0 + 8) >> 4);
-                src[ inc1] = (pel_t)((R2 * 3 + R1 * 8 + R0 * 4 + L0 + 8) >> 4);
+                src[-inc1] = (pel8_t)((L2 + (L1 << 2) + (L0 << 2) + (L0 << 1) + (R0 << 2) + R1 + 8) >> 4);   // L0
+                src[    0] = (pel8_t)((L1 + (L0 << 2) + (R0 << 2) + (R0 << 1) + (R1 << 2) + R2 + 8) >> 4);   // R0
+                src[-inc2] = (pel8_t)((L2 * 3 + L1 * 8 + L0 * 4 + R0 + 8) >> 4);
+                src[ inc1] = (pel8_t)((R2 * 3 + R1 * 8 + R0 * 4 + L0 + 8) >> 4);
                 break;
             case 2:
-                src[-inc1] = (pel_t)(((L1 << 1) + L1 + (L0 << 3) + (L0 << 1) + (R0 << 1) + R0 + 8) >> 4);
-                src[    0] = (pel_t)(((L0 << 1) + L0 + (R0 << 3) + (R0 << 1) + (R1 << 1) + R1 + 8) >> 4);
+                src[-inc1] = (pel8_t)(((L1 << 1) + L1 + (L0 << 3) + (L0 << 1) + (R0 << 1) + R0 + 8) >> 4);
+                src[    0] = (pel8_t)(((L0 << 1) + L0 + (R0 << 3) + (R0 << 1) + (R1 << 1) + R1 + 8) >> 4);
                 break;
             case 1:
-                src[-inc1] = (pel_t)((L0 * 3 + R0 + 2) >> 2);
-                src[    0] = (pel_t)((R0 * 3 + L0 + 2) >> 2);
+                src[-inc1] = (pel8_t)((L0 * 3 + R0 + 2) >> 2);
+                src[    0] = (pel8_t)((R0 * 3 + L0 + 2) >> 2);
+                break;
+            default:
+                break;
+            }
+        }
+
+        src += ptr_inc;    // next row or column
+        pel += b_chroma;
+    }
+}
+
+static void lf_edge_core10(pel10_t *src, int b_chroma, int ptr_inc, int inc1, int alpha, int beta, uint8_t *flt_flag)
+{
+    int pel;
+    int abs_delta;
+    int L2, L1, L0, R0, R1, R2;
+    int fs; // fs stands for filtering strength. The larger fs is, the stronger filter is applied.
+    int FlatnessL, FlatnessR;
+    int inc2, inc3;
+    int flag = 0;
+
+    inc2 = inc1 << 1;
+    inc3 = inc1 + inc2;
+    for (pel = 0; pel < MIN_CU_SIZE; pel++) {
+        L2 = src[-inc3];
+        L1 = src[-inc2];
+        L0 = src[-inc1];
+        R0 = src[    0];
+        R1 = src[ inc1];
+        R2 = src[ inc2];
+
+        abs_delta = XAVS2_ABS(R0 - L0);
+        flag = (pel < 4) ? flt_flag[0] : flt_flag[1];
+        if (flag && (abs_delta < alpha) && (abs_delta > 1)) {
+            FlatnessL = (XAVS2_ABS(L1 - L0) < beta) ? 2 : 0;
+            if (XAVS2_ABS(L2 - L0) < beta) {
+                FlatnessL += 1;
+            }
+
+            FlatnessR = (XAVS2_ABS(R0 - R1) < beta) ? 2 : 0;
+            if (XAVS2_ABS(R0 - R2) < beta) {
+                FlatnessR += 1;
+            }
+
+            switch (FlatnessL + FlatnessR) {
+            case 6:
+                fs = (R1 == R0 && L0 == L1) ? 4 : 3;
+                break;
+            case 5:
+                fs = (R1 == R0 && L0 == L1) ? 3 : 2;
+                break;
+            case 4:
+                fs = (FlatnessL == 2) ? 2 : 1;
+                break;
+            case 3:
+                fs = (XAVS2_ABS(L1 - R1) < beta) ? 1 : 0;
+                break;
+            default:
+                fs = 0;
+                break;
+            }
+
+            if (b_chroma && fs > 0) {
+                fs--;
+            }
+
+            switch (fs) {
+            case 4:
+                src[-inc1] = (pel10_t)((L0 + ((L0 + L2) << 3) + L2 + (R0 << 3) + (R2 << 2) + (R2 << 1) + 16) >> 5);   // L0
+                src[-inc2] = (pel10_t)(((L0 << 3) - L0 + (L2 << 2) + (L2 << 1) + R0 + (R0 << 1) + 8) >> 4);           // L1
+                src[-inc3] = (pel10_t)(((L0 << 2) + L2 + (L2 << 1) + R0 + 4) >> 3);                                   // L2
+                src[    0] = (pel10_t)((R0 + ((R0 + R2) << 3) + R2 + (L0 << 3) + (L2 << 2) + (L2 << 1) + 16) >> 5);   // R0
+                src[ inc1] = (pel10_t)(((R0 << 3) - R0 + (R2 << 2) + (R2 << 1) + L0 + (L0 << 1) + 8) >> 4);           // R1
+                src[ inc2] = (pel10_t)(((R0 << 2) + R2 + (R2 << 1) + L0 + 4) >> 3);                                   // R2
+                break;
+            case 3:
+                src[-inc1] = (pel10_t)((L2 + (L1 << 2) + (L0 << 2) + (L0 << 1) + (R0 << 2) + R1 + 8) >> 4);   // L0
+                src[    0] = (pel10_t)((L1 + (L0 << 2) + (R0 << 2) + (R0 << 1) + (R1 << 2) + R2 + 8) >> 4);   // R0
+                src[-inc2] = (pel10_t)((L2 * 3 + L1 * 8 + L0 * 4 + R0 + 8) >> 4);
+                src[ inc1] = (pel10_t)((R2 * 3 + R1 * 8 + R0 * 4 + L0 + 8) >> 4);
+                break;
+            case 2:
+                src[-inc1] = (pel10_t)(((L1 << 1) + L1 + (L0 << 3) + (L0 << 1) + (R0 << 1) + R0 + 8) >> 4);
+                src[    0] = (pel10_t)(((L0 << 1) + L0 + (R0 << 3) + (R0 << 1) + (R1 << 1) + R1 + 8) >> 4);
+                break;
+            case 1:
+                src[-inc1] = (pel10_t)((L0 * 3 + R0 + 2) >> 2);
+                src[    0] = (pel10_t)((R0 * 3 + L0 + 2) >> 2);
                 break;
             default:
                 break;
@@ -380,45 +468,113 @@ static void lf_edge_core(pel_t *src, int b_chroma, int ptr_inc, int inc1, int al
 
 /* ---------------------------------------------------------------------------
  */
-static void deblock_edge_hor(pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag)
+static void deblock_edge_hor8(xavs2_t *h, pel8_t *src, int stride, int alpha, int beta, uint8_t *flt_flag)
 {
-    lf_edge_core(src, 0, 1, stride, alpha, beta, flt_flag);
+    lf_edge_core8(src, 0, 1, stride, alpha, beta, flt_flag);  /* 8-bit luma, horizontal edge: taps are `stride` apart, src advances by 1; h is not used here */
+}
+
+static void deblock_edge_hor10(xavs2_t *h, pel10_t *src, int stride, int alpha, int beta, uint8_t *flt_flag)
+{
+    lf_edge_core10(src, 0, 1, stride, alpha, beta, flt_flag);  /* pel10_t luma, horizontal edge: taps are `stride` apart, src advances by 1; h is not used here */
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void deblock_edge_ver(pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag)
+static void deblock_edge_ver8(xavs2_t *h, pel8_t *src, int stride, int alpha, int beta, uint8_t *flt_flag)
 {
-    lf_edge_core(src, 0, stride, 1, alpha, beta, flt_flag);
+    lf_edge_core8(src, 0, stride, 1, alpha, beta, flt_flag);  /* 8-bit luma, vertical edge: taps are adjacent samples, src advances by `stride`; h is not used here */
+}
+
+static void deblock_edge_ver10(xavs2_t *h, pel10_t *src, int stride, int alpha, int beta, uint8_t *flt_flag)
+{
+    lf_edge_core10(src, 0, stride, 1, alpha, beta, flt_flag);  /* pel10_t luma, vertical edge: taps are adjacent samples, src advances by `stride`; h is not used here */
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void deblock_edge_ver_c(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag)
+static void deblock_edge_ver8_c(xavs2_t *h, pel8_t *src_u, pel8_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag)
+{
+    lf_edge_core8(src_u, 1, stride, 1, alpha, beta, flt_flag);  /* 8-bit chroma (b_chroma = 1), vertical edge of the U plane; h is not used here */
+    lf_edge_core8(src_v, 1, stride, 1, alpha, beta, flt_flag);  /* same edge, same thresholds and flags, on the V plane */
+}
+
+static void deblock_edge_ver10_c(xavs2_t *h, pel10_t *src_u, pel10_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag)
 {
-    lf_edge_core(src_u, 1, stride, 1, alpha, beta, flt_flag);
-    lf_edge_core(src_v, 1, stride, 1, alpha, beta, flt_flag);
+    lf_edge_core10(src_u, 1, stride, 1, alpha, beta, flt_flag);  /* pel10_t chroma (b_chroma = 1), vertical edge of the U plane; h is not used here */
+    lf_edge_core10(src_v, 1, stride, 1, alpha, beta, flt_flag);  /* same edge, same thresholds and flags, on the V plane */
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void deblock_edge_hor_c(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag)
+static void deblock_edge_hor8_c(xavs2_t *h, pel8_t *src_u, pel8_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag)
 {
-    lf_edge_core(src_u, 1, 1, stride, alpha, beta, flt_flag);
-    lf_edge_core(src_v, 1, 1, stride, alpha, beta, flt_flag);
+    lf_edge_core8(src_u, 1, 1, stride, alpha, beta, flt_flag);  /* 8-bit chroma (b_chroma = 1), horizontal edge of the U plane; h is not used here */
+    lf_edge_core8(src_v, 1, 1, stride, alpha, beta, flt_flag);  /* same edge, same thresholds and flags, on the V plane */
+}
+
+static void deblock_edge_hor10_c(xavs2_t *h, pel10_t *src_u, pel10_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag)
+{
+    lf_edge_core10(src_u, 1, 1, stride, alpha, beta, flt_flag);  /* pel10_t chroma (b_chroma = 1), horizontal edge of the U plane; h is not used here */
+    lf_edge_core10(src_v, 1, 1, stride, alpha, beta, flt_flag);  /* same edge, same thresholds and flags, on the V plane */
 }
 
 /* ---------------------------------------------------------------------------
  */
 static
-void lf_scu_deblock(xavs2_t *h, pel_t *p_rec[3], int i_stride, int i_stride_c, int scu_x, int scu_y, int dir)
+void lf_scu_deblock8(xavs2_t *h, pel8_t *p_rec[3], int i_stride, int i_stride_c, int scu_x, int scu_y, int dir)
+{
+#define MAX_QP_DEBLOCK  MAX_QP  /* upper clip for the alpha/beta table index */
+    cu_info_t *MbQ = &h->cu_info[scu_y * h->i_width_in_mincu + scu_x];  /* current SCU */
+    int edge_type = h->p_deblock_flag[dir][(scu_y - h->lcu.i_scu_y) * h->i_width_in_mincu + scu_x];  /* flag rows are counted from the LCU's first SCU row */
+
+    if (edge_type != EDGE_TYPE_NOFILTER) {
+        pel8_t *src_y = p_rec[0] + (scu_y << MIN_CU_SIZE_IN_BIT) * i_stride + (scu_x << MIN_CU_SIZE_IN_BIT);
+        cu_info_t *MbP = dir ? (MbQ - h->i_width_in_mincu) : (MbQ - 1); /* MbP = Mb of the remote 4x4 block */
+        int QP = (cu_get_qp(h, MbP) + cu_get_qp(h, MbQ) + 1) >> 1;                /* average QP of the two blocks */
+        int shift = h->param->sample_bit_depth - 8;
+        int offset = shift << 3;  /* coded as 10/12 bit, QP is added by (8 * (h->param->sample_bit_depth - 8)) in config file */
+        int alpha, beta;
+        uint8_t b_filter_edge[2];  /* one filter flag per half of the SCU edge */
+
+        b_filter_edge[0] = lf_skip_filter(h, MbP, MbQ, dir, (scu_x << 1), (scu_y << 1));
+        b_filter_edge[1] = lf_skip_filter(h, MbP, MbQ, dir, (scu_x << 1) + dir, (scu_y << 1) + !dir);
+
+        if (b_filter_edge[0] == 0 && b_filter_edge[1] == 0) {
+            return;  /* neither half of the edge is to be filtered */
+        }
+
+        /* deblock luma edge */
+        alpha = tab_deblock_alpha[XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP - offset + h->param->alpha_c_offset)] << shift;
+        beta  = tab_deblock_beta [XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP - offset + h->param->beta_offset)] << shift;
+
+        g_funcs.deblock_luma8[dir](h, src_y, i_stride, alpha, beta, b_filter_edge);
+
+        assert(h->param->chroma_format == CHROMA_420 || h->param->chroma_format == CHROMA_400);   /* only support I420/I400 now */
+        /* deblock chroma edge */
+        if (edge_type == EDGE_TYPE_BOTH && h->param->chroma_format == CHROMA_420)
+            if ((((scu_y & 1) == 0) && dir) || (((scu_x & 1) == 0) && (!dir))) {  /* only even SCU rows (dir != 0) or even SCU columns (dir == 0) */
+                pel8_t *src_u = p_rec[1] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1));
+                pel8_t *src_v = p_rec[2] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1));
+
+                int alpha_c, beta_c;
+                QP = cu_get_chroma_qp(h, QP, 0) - offset;
+                alpha_c = tab_deblock_alpha[XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP + h->param->alpha_c_offset)] << shift;
+                beta_c  = tab_deblock_beta [XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP + h->param->beta_offset)] << shift;
+                g_funcs.deblock_chroma8[dir](h, src_u, src_v, i_stride_c, alpha_c, beta_c, b_filter_edge);
+            }
+    }
+#undef MAX_QP_DEBLOCK
+}
+
+static
+void lf_scu_deblock10(xavs2_t *h, pel10_t *p_rec[3], int i_stride, int i_stride_c, int scu_x, int scu_y, int dir)
 {
-    static const int max_qp_deblock = 63;
+#define MAX_QP_DEBLOCK  MAX_QP  /* table index limit (63 before the split); the bit-depth QP offset is subtracted before the clip, so it must not be added to the limit */
     cu_info_t *MbQ = &h->cu_info[scu_y * h->i_width_in_mincu + scu_x];  /* current SCU */
     int edge_type = h->p_deblock_flag[dir][(scu_y - h->lcu.i_scu_y) * h->i_width_in_mincu + scu_x];
 
     if (edge_type != EDGE_TYPE_NOFILTER) {
-        pel_t *src_y = p_rec[0] + (scu_y << MIN_CU_SIZE_IN_BIT) * i_stride + (scu_x << MIN_CU_SIZE_IN_BIT);
+        pel10_t *src_y = p_rec[0] + (scu_y << MIN_CU_SIZE_IN_BIT) * i_stride + (scu_x << MIN_CU_SIZE_IN_BIT);
         cu_info_t *MbP = dir ? (MbQ - h->i_width_in_mincu) : (MbQ - 1); /* MbP = Mb of the remote 4x4 block */
         int QP = (cu_get_qp(h, MbP) + cu_get_qp(h, MbQ) + 1) >> 1;                /* average QP of the two blocks */
         int shift = h->param->sample_bit_depth - 8;
@@ -434,25 +590,26 @@ void lf_scu_deblock(xavs2_t *h, pel_t *p_rec[3], int i_stride, int i_stride_c, i
         }
 
         /* deblock luma edge */
-        alpha = tab_deblock_alpha[XAVS2_CLIP3(0, max_qp_deblock, QP - offset + h->param->alpha_c_offset)] << shift;
-        beta  = tab_deblock_beta [XAVS2_CLIP3(0, max_qp_deblock, QP - offset + h->param->beta_offset)] << shift;
+        alpha = tab_deblock_alpha[XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP - offset + h->param->alpha_c_offset)] << shift;
+        beta  = tab_deblock_beta [XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP - offset + h->param->beta_offset)] << shift;
 
-        g_funcs.deblock_luma[dir](src_y, i_stride, alpha, beta, b_filter_edge);
+        g_funcs.deblock_luma10[dir](h, src_y, i_stride, alpha, beta, b_filter_edge);
 
         assert(h->param->chroma_format == CHROMA_420 || h->param->chroma_format == CHROMA_400);   /* only support I420/I400 now */
         /* deblock chroma edge */
         if (edge_type == EDGE_TYPE_BOTH && h->param->chroma_format == CHROMA_420)
             if ((((scu_y & 1) == 0) && dir) || (((scu_x & 1) == 0) && (!dir))) {
-                pel_t *src_u = p_rec[1] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1));
-                pel_t *src_v = p_rec[2] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1));
+                pel10_t *src_u = p_rec[1] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1));
+                pel10_t *src_v = p_rec[2] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1));
 
                 int alpha_c, beta_c;
                 QP = cu_get_chroma_qp(h, QP, 0) - offset;
-                alpha_c = tab_deblock_alpha[XAVS2_CLIP3(0, max_qp_deblock, QP + h->param->alpha_c_offset)] << shift;
-                beta_c  = tab_deblock_beta [XAVS2_CLIP3(0, max_qp_deblock, QP + h->param->beta_offset)] << shift;
-                g_funcs.deblock_chroma[dir](src_u, src_v, i_stride_c, alpha_c, beta_c, b_filter_edge);
+                alpha_c = tab_deblock_alpha[XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP + h->param->alpha_c_offset)] << shift;
+                beta_c  = tab_deblock_beta [XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP + h->param->beta_offset)] << shift;
+                g_funcs.deblock_chroma10[dir](h, src_u, src_v, i_stride_c, alpha_c, beta_c, b_filter_edge);
             }
     }
+#undef MAX_QP_DEBLOCK
 }
 
 /**
@@ -491,7 +648,11 @@ void xavs2_lcu_deblock(xavs2_t *h, xavs2_frame_t *frm)
     /* deblock all vertical edges in one LCU */
     for (j = 0; j < num_of_scu_ver; j++) {
         for (i = 0; i < num_of_scu_hor; i++) {
-            lf_scu_deblock(h, frm->planes, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_VER);
+            if (h->param->input_sample_bit_depth == 8) {
+            lf_scu_deblock8(h, frm->planes8, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_VER);
+            } else {
+            lf_scu_deblock10(h, frm->planes10, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_VER);
+            }
         }
     }
 
@@ -512,35 +673,57 @@ void xavs2_lcu_deblock(xavs2_t *h, xavs2_frame_t *frm)
     /* deblock all horizontal edges in one LCU */
     for (j = 0; j < num_of_scu_ver; j++) {
         for (i = 0; i < num_of_scu_hor; i++) {
-            lf_scu_deblock(h, frm->planes, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_HOR);
+            if (h->param->input_sample_bit_depth == 8) {
+            lf_scu_deblock8(h, frm->planes8, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_HOR);
+            } else {
+            lf_scu_deblock10(h, frm->planes10, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_HOR);
+            }
         }
     }
 }
 
 /* ---------------------------------------------------------------------------
+ * Fill the deblocking function table that matches the configured input bit
+ * depth: the 8-bit handles for depth 8, the pel10_t handles for any other.
  */
-void xavs2_deblock_init(uint32_t cpuid, intrinsic_func_t* lf)
+void xavs2_deblock_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t* lf)
 {
-    lf->deblock_luma  [0] = deblock_edge_ver;
-    lf->deblock_luma  [1] = deblock_edge_hor;
-    lf->deblock_chroma[0] = deblock_edge_ver_c;
-    lf->deblock_chroma[1] = deblock_edge_hor_c;
+    if (param->input_sample_bit_depth == 8) {
+    lf->deblock_luma8  [0] = deblock_edge_ver8;
+    lf->deblock_luma8  [1] = deblock_edge_hor8;
+    lf->deblock_chroma8[0] = deblock_edge_ver8_c;
+    lf->deblock_chroma8[1] = deblock_edge_hor8_c;
 
 #if HAVE_MMX
     if (cpuid & XAVS2_CPU_SSE42) {
-        lf->deblock_luma[0] = deblock_edge_ver_sse128;
-        lf->deblock_luma[1] = deblock_edge_hor_sse128;
-        // lf->deblock_chroma[0] = deblock_edge_ver_c_sse128;
-        // lf->deblock_chroma[1] = deblock_edge_hor_c_sse128;
+        lf->deblock_luma8[0] = deblock_edge_ver_sse128;
+        lf->deblock_luma8[1] = deblock_edge_hor_sse128;
+        // lf->deblock_chroma8[0] = deblock_edge_ver_c_sse128;
+        // lf->deblock_chroma8[1] = deblock_edge_hor_c_sse128;
     }
+
+#if defined(__AVX2__)
     if (cpuid & XAVS2_CPU_AVX2) {
         // In some machines, avx is slower than SSE
-        // lf->deblock_luma[0]   = deblock_edge_ver_avx2;
-        // lf->deblock_luma[1]   = deblock_edge_hor_avx2;
-        // lf->deblock_chroma[0] = deblock_edge_ver_c_avx2;
-        // lf->deblock_chroma[1] = deblock_edge_hor_c_avx2;
+        // lf->deblock_luma8[0]   = deblock_edge_ver_avx2;
+        // lf->deblock_luma8[1]   = deblock_edge_hor_avx2;
+        // lf->deblock_chroma8[0] = deblock_edge_ver_c_avx2;
+        // lf->deblock_chroma8[1] = deblock_edge_hor_c_avx2;
     }
+#endif
 #else
     UNUSED_PARAMETER(cpuid);
 #endif
+    } else {
+    /* pel10_t handles: install the portable C kernels only.
+     * deblock_edge_ver_sse128 / deblock_edge_hor_sse128 are the routines bound
+     * to the pel8_t table above; one function cannot match both the pel8_t and
+     * the pel10_t handle signatures, so they must not be bound here as well.
+     * NOTE(review): add dedicated SIMD kernels for pel10_t before enabling an
+     * accelerated path for this bit depth. */
+    lf->deblock_luma10  [0] = deblock_edge_ver10;
+    lf->deblock_luma10  [1] = deblock_edge_hor10;
+    lf->deblock_chroma10[0] = deblock_edge_ver10_c;
+    lf->deblock_chroma10[1] = deblock_edge_hor10_c;
+    }
 }
diff --git a/source/common/filter_sao.c b/source/common/filter_sao.c
index d813ee9..022d745 100644
--- a/source/common/filter_sao.c
+++ b/source/common/filter_sao.c
@@ -48,14 +48,14 @@
 
 /* ---------------------------------------------------------------------------
 */
-static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
+static void sao_block8_c(xavs2_t *h, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src,
                         int i_block_w, int i_block_h,
                         int *lcu_avail, SAOBlkParam *sao_param)
 {
     int8_t SIGN_BUF[MAX_CU_SIZE + 32];  // sign of top line
     int8_t *UPROW_S = SIGN_BUF + 16;
     int  *sao_offset = sao_param->offset;
-    const int max_pel_val = (1 << g_bit_depth) - 1;
+    const int max_pel_val = (1 << h->param->input_sample_bit_depth) - 1;
     int reg = 0;
     int sx, sy, ex, ey;               // start/end (x, y)
     int sx_0, ex_0, sx_n, ex_n;       // start/end x for first and last row
@@ -75,7 +75,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
                 right_sign = xavs2_sign3(p_src[x] - p_src[x + 1]);
                 edge_type = left_sign + right_sign + 2;
                 left_sign = -right_sign;
-                p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+                p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
             }
             p_src += i_src;
             p_dst += i_dst;
@@ -92,7 +92,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
                 down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
                 edge_type = down_sign + top_sign + 2;
                 top_sign = -down_sign;
-                p_dst[y * i_dst + x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[y * i_src + x] + sao_offset[edge_type]);
+                p_dst[y * i_dst + x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[y * i_src + x] + sao_offset[edge_type]);
             }
         }
         break;
@@ -115,7 +115,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
             pel_diff = p_src[x] - p_src[-i_src + x - 1];
             top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
             edge_type = top_sign - UPROW_S[x + 1] + 2;
-            p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+            p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
         }
 
         // middle rows
@@ -131,7 +131,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
                 pel_diff = p_src[x] - p_src[i_src + x + 1];
                 down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
                 edge_type = down_sign + UPROW_S[x] + 2;
-                p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+                p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
                 UPROW_S[x] = (int8_t)reg;
                 reg = -down_sign;
             }
@@ -151,7 +151,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
             pel_diff = p_src[x] - p_src[i_src + x + 1];
             down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
             edge_type = down_sign + UPROW_S[x] + 2;
-            p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+            p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
         }
         break;
     case SAO_TYPE_EO_45:
@@ -172,7 +172,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
             pel_diff = p_src[x] - p_src[-i_src + x + 1];
             top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
             edge_type = top_sign - UPROW_S[x - 1] + 2;
-            p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+            p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
         }
 
         // middle rows
@@ -188,7 +188,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
                 pel_diff = p_src[x] - p_src[i_src + x - 1];
                 down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
                 edge_type = down_sign + UPROW_S[x] + 2;
-                p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+                p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
                 UPROW_S[x - 1] = (int8_t)(-down_sign);
             }
         }
@@ -207,15 +207,197 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
             pel_diff = p_src[x] - p_src[i_src + x - 1];
             down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
             edge_type = down_sign + UPROW_S[x] + 2;
-            p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+            p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
         }
         break;
     case SAO_TYPE_BO:
-        pel_diff = g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT;
+        pel_diff = h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT;
         for (y = 0; y < i_block_h; y++) {
             for (x = 0; x < i_block_w; x++) {
                 edge_type = p_src[x] >> pel_diff;
-                p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+                p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+            }
+            p_src += i_src;
+            p_dst += i_dst;
+        }
+        break;
+    default:
+        xavs2_log(NULL, XAVS2_LOG_ERROR, "Not a supported SAO types.");
+        assert(0);
+        exit(-1);
+    }
+}
+
+static void sao_block10_c(xavs2_t *h, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src,
+                        int i_block_w, int i_block_h,
+                        int *lcu_avail, SAOBlkParam *sao_param)
+{
+    int8_t SIGN_BUF[MAX_CU_SIZE + 32];  // sign of top line
+    int8_t *UPROW_S = SIGN_BUF + 16;
+    int  *sao_offset = sao_param->offset;
+    const int max_pel_val = (1 << h->param->input_sample_bit_depth) - 1;
+    int reg = 0;
+    int sx, sy, ex, ey;               // start/end (x, y)
+    int sx_0, ex_0, sx_n, ex_n;       // start/end x for first and last row
+    int left_sign, right_sign, top_sign, down_sign;
+    int edge_type;
+    int pel_diff;
+    int x, y;
+
+    assert(sao_param->typeIdc != SAO_TYPE_OFF);
+    switch (sao_param->typeIdc) {
+    case SAO_TYPE_EO_0:
+        sx = lcu_avail[SAO_L] ? 0 : 1;
+        ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1);
+        for (y = 0; y < i_block_h; y++) {
+            pel_diff = p_src[sx] - p_src[sx - 1];
+            left_sign = pel_diff > 0? 1 : (pel_diff < 0? -1 : 0);
+            for (x = sx; x < ex; x++) {
+                pel_diff = p_src[x] - p_src[x + 1];
+                right_sign = pel_diff > 0? 1 : (pel_diff < 0? -1 : 0);
+                edge_type = left_sign + right_sign + 2;
+                left_sign = -right_sign;
+                p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+            }
+            p_src += i_src;
+            p_dst += i_dst;
+        }
+        break;
+    case SAO_TYPE_EO_90: {
+        sy = lcu_avail[SAO_T] ? 0 : 1;
+        ey = lcu_avail[SAO_D] ? i_block_h : (i_block_h - 1);
+        for (x = 0; x < i_block_w; x++) {
+            pel_diff = p_src[sy * i_src + x] - p_src[(sy - 1) * i_src + x];
+            top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+            for (y = sy; y < ey; y++) {
+                pel_diff = p_src[y * i_src + x] - p_src[(y + 1) * i_src + x];
+                down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+                edge_type = down_sign + top_sign + 2;
+                top_sign = -down_sign;
+                p_dst[y * i_dst + x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[y * i_src + x] + sao_offset[edge_type]);
+            }
+        }
+        break;
+    }
+    case SAO_TYPE_EO_135:
+        sx = lcu_avail[SAO_L] ? 0 : 1;
+        ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1);
+
+        // init the line buffer
+        for (x = sx; x < ex; x++) {
+            pel_diff = p_src[i_src + x + 1] - p_src[x];
+            top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+            UPROW_S[x + 1] = (int8_t)top_sign;
+        }
+
+        // first row
+        sx_0 = lcu_avail[SAO_TL] ? 0 : 1;
+        ex_0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1)) : 1;
+        for (x = sx_0; x < ex_0; x++) {
+            pel_diff = p_src[x] - p_src[-i_src + x - 1];
+            top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+            edge_type = top_sign - UPROW_S[x + 1] + 2;
+            p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+        }
+
+        // middle rows
+        for (y = 1; y < i_block_h - 1; y++) {
+            p_src += i_src;
+            p_dst += i_dst;
+            for (x = sx; x < ex; x++) {
+                if (x == sx) {
+                    pel_diff = p_src[x] - p_src[-i_src + x - 1];
+                    top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+                    UPROW_S[x] = (int8_t)top_sign;
+                }
+                pel_diff = p_src[x] - p_src[i_src + x + 1];
+                down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+                edge_type = down_sign + UPROW_S[x] + 2;
+                p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+                UPROW_S[x] = (int8_t)reg;
+                reg = -down_sign;
+            }
+        }
+
+        // last row
+        sx_n = lcu_avail[SAO_D] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1);
+        ex_n = lcu_avail[SAO_DR] ? i_block_w : (i_block_w - 1);
+        p_src += i_src;
+        p_dst += i_dst;
+        for (x = sx_n; x < ex_n; x++) {
+            if (x == sx) {
+                pel_diff = p_src[x] - p_src[-i_src + x - 1];
+                top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+                UPROW_S[x] = (int8_t)top_sign;
+            }
+            pel_diff = p_src[x] - p_src[i_src + x + 1];
+            down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+            edge_type = down_sign + UPROW_S[x] + 2;
+            p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+        }
+        break;
+    case SAO_TYPE_EO_45:
+        sx = lcu_avail[SAO_L] ? 0 : 1;
+        ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1);
+
+        // init the line buffer
+        for (x = sx; x < ex; x++) {
+            pel_diff = p_src[i_src + x - 1] - p_src[x];
+            top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+            UPROW_S[x - 1] = (int8_t)top_sign;
+        }
+
+        // first row
+        sx_0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1);
+        ex_0 = lcu_avail[SAO_TR] ? i_block_w : (i_block_w - 1);
+        for (x = sx_0; x < ex_0; x++) {
+            pel_diff = p_src[x] - p_src[-i_src + x + 1];
+            top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+            edge_type = top_sign - UPROW_S[x - 1] + 2;
+            p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+        }
+
+        // middle rows
+        for (y = 1; y < i_block_h - 1; y++) {
+            p_src += i_src;
+            p_dst += i_dst;
+            for (x = sx; x < ex; x++) {
+                if (x == ex - 1) {
+                    pel_diff = p_src[x] - p_src[-i_src + x + 1];
+                    top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+                    UPROW_S[x] = (int8_t)top_sign;
+                }
+                pel_diff = p_src[x] - p_src[i_src + x - 1];
+                down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+                edge_type = down_sign + UPROW_S[x] + 2;
+                p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+                UPROW_S[x - 1] = (int8_t)(-down_sign);
+            }
+        }
+
+        // last row
+        sx_n = lcu_avail[SAO_DL] ? 0 : 1;
+        ex_n = lcu_avail[SAO_D] ? (lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1)) : 1;
+        p_src += i_src;
+        p_dst += i_dst;
+        for (x = sx_n; x < ex_n; x++) {
+            if (x == ex - 1) {
+                pel_diff = p_src[x] - p_src[-i_src + x + 1];
+                top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+                UPROW_S[x] = (int8_t)top_sign;
+            }
+            pel_diff = p_src[x] - p_src[i_src + x - 1];
+            down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0);
+            edge_type = down_sign + UPROW_S[x] + 2;
+            p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
+        }
+        break;
+    case SAO_TYPE_BO:
+        pel_diff = h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT;
+        for (y = 0; y < i_block_h; y++) {
+            for (x = 0; x < i_block_w; x++) {
+                edge_type = p_src[x] >> pel_diff;
+                p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]);
             }
             p_src += i_src;
             p_dst += i_dst;
@@ -230,17 +412,31 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
 
 /* ---------------------------------------------------------------------------
  */
-void xavs2_sao_init(uint32_t cpuid, intrinsic_func_t *pf)
+void xavs2_sao_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf)  /* installs the SAO block filter into pf, chosen by param->input_sample_bit_depth and the cpuid flags */
 {
-    pf->sao_block = sao_block_c;
+    if (param->input_sample_bit_depth == 8) {  /* depth 8: only the sao_block8 slot is written */
+    pf->sao_block8 = sao_block8_c;  /* C implementation is the default */
+#if HAVE_MMX
+    if (cpuid & XAVS2_CPU_SSE4) {
+        pf->sao_block8 = SAO_on_block_sse128;  /* overrides the C default when the SSE4 flag is set */
+    }
+#ifdef _MSC_VER
+    if (cpuid & XAVS2_CPU_AVX2) {
+        pf->sao_block8 = SAO_on_block_sse256;  /* assigned last, so it wins when both flags are set (MSVC builds only) */
+    }
+#endif // if _MSC_VER
+#endif // HAVE_MMX
+    } else {  /* any depth other than 8: only the sao_block10 slot is written */
+    pf->sao_block10 = sao_block10_c;  /* C implementation is the default */
 #if HAVE_MMX
     if (cpuid & XAVS2_CPU_SSE4) {
-        pf->sao_block = SAO_on_block_sse128;
+        pf->sao_block10 = SAO_on_block_sse128;  /* NOTE(review): same routine as installed for sao_block8 above - confirm it handles pel10_t samples */
     }
 #ifdef _MSC_VER
     if (cpuid & XAVS2_CPU_AVX2) {
-        pf->sao_block = SAO_on_block_sse256;
+        pf->sao_block10 = SAO_on_block_sse256;  /* NOTE(review): same routine as installed for sao_block8 above - confirm it handles pel10_t samples */
     }
 #endif // if _MSC_VER
 #endif // HAVE_MMX
+    }
 }
diff --git a/source/common/frame.c b/source/common/frame.c
index 976ecc7..544cd8e 100644
--- a/source/common/frame.c
+++ b/source/common/frame.c
@@ -147,11 +147,12 @@ size_t xavs2_frame_buffer_size(const xavs2_param_t *param, int alloc_type)
     }
 
     /* compute space size and alloc memory */
+    if (param->input_sample_bit_depth == 8) {
     mem_size = sizeof(xavs2_frame_t)                + /* M0, size of frame handle */
                i_nal_info_size                             + /* M1, size of nal_info buffer */
                cmp_size + cmp_buf_size                     + /* M2, size of frame complexity buffer */
                bs_size                                     + /* M3, size of bitstream buffer */
-               planes_size * sizeof(pel_t)                 + /* M4, size of planes buffer: Y+U+V */
+               planes_size * sizeof(pel8_t)                 + /* M4, size of planes buffer: Y+U+V */
                frame_size_in_mvstore * sizeof(int8_t)      + /* M5, size of pu reference index buffer */
                frame_size_in_mvstore * sizeof(mv_t)        + /* M6, size of pu motion vector buffer */
 #if SAVE_CU_INFO
@@ -159,11 +160,28 @@ size_t xavs2_frame_buffer_size(const xavs2_param_t *param, int alloc_type)
 #endif
                (img_h_l >> MIN_CU_SIZE_IN_BIT) * sizeof(int)+ /* M8, line status array */
                CACHE_LINE_SIZE * 10;
+    /* align to CACHE_LINE_SIZE */
+    mem_size = (mem_size + CACHE_LINE_SIZE - 1) & (~(uint32_t)(CACHE_LINE_SIZE - 1));
 
+    return mem_size;
+    } else {
+    mem_size = sizeof(xavs2_frame_t)                + /* M0, size of frame handle */
+               i_nal_info_size                             + /* M1, size of nal_info buffer */
+               cmp_size + cmp_buf_size                     + /* M2, size of frame complexity buffer */
+               bs_size                                     + /* M3, size of bitstream buffer */
+               planes_size * sizeof(pel10_t)                 + /* M4, size of planes buffer: Y+U+V */
+               frame_size_in_mvstore * sizeof(int8_t)      + /* M5, size of pu reference index buffer */
+               frame_size_in_mvstore * sizeof(mv_t)        + /* M6, size of pu motion vector buffer */
+#if SAVE_CU_INFO
+               frame_size_in_mincu * sizeof(int8_t) * 3    + /* M7, size of cu mode/cbp/level buffers */
+#endif
+               (img_h_l >> MIN_CU_SIZE_IN_BIT) * sizeof(int)+ /* M8, line status array */
+               CACHE_LINE_SIZE * 10;
     /* align to CACHE_LINE_SIZE */
     mem_size = (mem_size + CACHE_LINE_SIZE - 1) & (~(uint32_t)(CACHE_LINE_SIZE - 1));
 
     return mem_size;
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -189,7 +207,6 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type)
     int frame_size_in_mincu = 0;
 #endif
     int frame_size_in_mvstore = 0;  /* reference information size */
-    uint8_t *mem_ptr;
 
     /* compute stride and the plane size */
     switch (alloc_type) {
@@ -236,11 +253,13 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type)
     }
 
     /* compute space size and alloc memory */
+    if (h->param->input_sample_bit_depth == 8) {
+    uint8_t *mem_ptr;
     mem_size = sizeof(xavs2_frame_t)                + /* M0, size of frame handle */
                i_nal_info_size                             + /* M1, size of nal_info buffer */
                cmp_size + cmp_buf_size                     + /* M2, size of frame complexity buffer */
                bs_size                                     + /* M3, size of bitstream buffer */
-               planes_size * sizeof(pel_t)                 + /* M4, size of planes buffer: Y+U+V */
+               planes_size * sizeof(pel8_t)                 + /* M4, size of planes buffer: Y+U+V */
                frame_size_in_mvstore * sizeof(int8_t)      + /* M5, size of pu reference index buffer */
                frame_size_in_mvstore * sizeof(mv_t)        + /* M6, size of pu motion vector buffer */
 #if SAVE_CU_INFO
@@ -248,12 +267,11 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type)
 #endif
                h->i_height_in_lcu * sizeof(int)            + /* M8, line status array */
                CACHE_LINE_SIZE * 10;
-
     /* align to CACHE_LINE_SIZE */
     mem_size = (mem_size + CACHE_LINE_SIZE - 1) & (~(uint32_t)(CACHE_LINE_SIZE - 1));
 
     if (mem_base == NULL) {
-        CHECKED_MALLOC(mem_ptr, uint8_t *, mem_size);
+        CHECKED_MALLOC8(mem_ptr, uint8_t *, mem_size);
     } else {
         mem_ptr = *mem_base;
     }
@@ -305,54 +323,54 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type)
     }
 
     /* M3, buffer for planes: Y+U+V */
-    frame->plane_buf = (pel_t *)mem_ptr;
-    frame->size_plane_buf = (size_l + 2 * size_c) * sizeof(pel_t);
+    frame->plane_buf8 = (pel8_t *)mem_ptr;
+    frame->size_plane_buf = (size_l + 2 * size_c) * sizeof(pel8_t);
 
-    frame->planes[0] = (pel_t *)mem_ptr;
-    frame->planes[1] = frame->planes[0] + size_l;
-    frame->planes[2] = frame->planes[1] + size_c;
-    mem_ptr         += (size_l + size_c * 2) * sizeof(pel_t);
+    frame->planes8[0] = (pel8_t *)mem_ptr;
+    frame->planes8[1] = frame->planes8[0] + size_l;
+    frame->planes8[2] = frame->planes8[1] + size_c;
+    mem_ptr         += (size_l + size_c * 2) * sizeof(pel8_t);
 
     if (alloc_type == FT_DEC || alloc_type == FT_TEMP) {
         uint8_t *p_align;
         /* point to plane data area */
-        frame->planes[0] += frame->i_stride[0] * (XAVS2_PAD    ) + (XAVS2_PAD    );
-        frame->planes[1] += frame->i_stride[1] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2);
-        frame->planes[2] += frame->i_stride[2] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2);
+        frame->planes8[0] += frame->i_stride[0] * (XAVS2_PAD    ) + (XAVS2_PAD    );
+        frame->planes8[1] += frame->i_stride[1] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2);
+        frame->planes8[2] += frame->i_stride[2] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2);
 
         /* make sure the pointers are aligned */
-        p_align = (uint8_t *)frame->planes[0];
+        p_align = (uint8_t *)frame->planes8[0];
         ALIGN_POINTER(p_align);
-        frame->planes[0] = (pel_t *)p_align;
-        p_align = (uint8_t *)frame->planes[1];
+        frame->planes8[0] = (pel8_t *)p_align;
+        p_align = (uint8_t *)frame->planes8[1];
         ALIGN_POINTER(p_align);
-        frame->planes[1] = (pel_t *)p_align;
-        p_align = (uint8_t *)frame->planes[2];
+        frame->planes8[1] = (pel8_t *)p_align;
+        p_align = (uint8_t *)frame->planes8[2];
         ALIGN_POINTER(p_align);
-        frame->planes[2] = (pel_t *)p_align;
+        frame->planes8[2] = (pel8_t *)p_align;
     }
 
     if (alloc_type == FT_DEC) {
         /* buffer for luma interpolated planes */
-        frame->filtered[0] = frame->planes[0];  // full pel plane, reused
+        frame->filtered8[0] = frame->planes8[0];  // full pel plane, reused
         for (i = 1; i < 16; i++) {
-            frame->filtered[i] = NULL;
+            frame->filtered8[i] = NULL;
         }
 #if ENABLE_FRAME_SUBPEL_INTPL
         switch (h->use_fractional_me) {
         case 1:
-            frame->filtered[2]  = (pel_t *)mem_ptr;
-            mem_ptr            += size_l * sizeof(pel_t);
-            frame->filtered[8]  = (pel_t *)mem_ptr;
-            mem_ptr            += size_l * sizeof(pel_t);
-            frame->filtered[10] = (pel_t *)mem_ptr;
-            mem_ptr            += size_l * sizeof(pel_t);
+            frame->filtered8[2]  = (pel8_t *)mem_ptr;
+            mem_ptr            += size_l * sizeof(pel8_t);
+            frame->filtered8[8]  = (pel8_t *)mem_ptr;
+            mem_ptr            += size_l * sizeof(pel8_t);
+            frame->filtered8[10] = (pel8_t *)mem_ptr;
+            mem_ptr            += size_l * sizeof(pel8_t);
 
             break;
         case 2:
             for (i = 1; i < 16; i++) {
-                frame->filtered[i] = (pel_t *)mem_ptr;
-                mem_ptr           += size_l * sizeof(pel_t);
+                frame->filtered8[i] = (pel8_t *)mem_ptr;
+                mem_ptr           += size_l * sizeof(pel8_t);
             }
             break;
         default:
@@ -361,8 +379,8 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type)
 #endif
         /* point to plane data area */
         for (i = 1; i < 16; i++) {
-            if (frame->filtered[i] != NULL) {
-                frame->filtered[i] += frame->i_stride[0] * XAVS2_PAD + XAVS2_PAD;
+            if (frame->filtered8[i] != NULL) {
+                frame->filtered8[i] += frame->i_stride[0] * XAVS2_PAD + XAVS2_PAD;
             }
         }
         ALIGN_POINTER(mem_ptr);
@@ -400,7 +418,7 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type)
 
     if (mem_ptr - (uint8_t *)frame > mem_size) {
         xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to alloc one frame, type %d\n", alloc_type);
-        goto fail;
+        goto fail8;
     }
 
     /* update mem_base */
@@ -414,17 +432,208 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type)
 
     /* initialize signals */
     if (xavs2_thread_mutex_init(&frame->mutex, NULL)) {
-        goto fail;
+        goto fail8;
+    }
+    if (xavs2_thread_cond_init(&frame->cond, NULL)) {
+        goto fail8;
+    }
+
+    return frame;
+
+fail8:
+    xavs2_free(mem_ptr);
+    return NULL;
+    } else {
+    uint8_t *mem_ptr;
+    mem_size = sizeof(xavs2_frame_t)                + /* M0, size of frame handle */
+               i_nal_info_size                             + /* M1, size of nal_info buffer */
+               cmp_size + cmp_buf_size                     + /* M2, size of frame complexity buffer */
+               bs_size                                     + /* M3, size of bitstream buffer */
+               planes_size * sizeof(pel10_t)                 + /* M4, size of planes buffer: Y+U+V */
+               frame_size_in_mvstore * sizeof(int8_t)      + /* M5, size of pu reference index buffer */
+               frame_size_in_mvstore * sizeof(mv_t)        + /* M6, size of pu motion vector buffer */
+#if SAVE_CU_INFO
+               frame_size_in_mincu * sizeof(int8_t) * 3    + /* M7, size of cu mode/cbp/level buffers */
+#endif
+               h->i_height_in_lcu * sizeof(int)            + /* M8, line status array */
+               CACHE_LINE_SIZE * 10;
+    /* align to CACHE_LINE_SIZE */
+    mem_size = (mem_size + CACHE_LINE_SIZE - 1) & (~(uint32_t)(CACHE_LINE_SIZE - 1));
+
+    if (mem_base == NULL) {
+        CHECKED_MALLOC10(mem_ptr, uint8_t *, mem_size);
+    } else {
+        mem_ptr = /*(uint16_t*)*/*mem_base;
+    }
+
+    /* M0, frame handle */
+    frame    = (xavs2_frame_t *)mem_ptr;
+    mem_ptr += sizeof(xavs2_frame_t);
+    ALIGN_POINTER(mem_ptr);
+
+    /* set frame properties */
+    frame->i_plane     = 3;           /* planes: Y+U+V */
+    frame->i_width [0] = img_w_l;
+    frame->i_lines [0] = img_h_l;
+    frame->i_stride[0] = stride_l;
+    frame->i_width [1] = frame->i_width [2] = img_w_c;
+    frame->i_lines [1] = frame->i_lines [2] = img_h_c;
+    frame->i_stride[1] = frame->i_stride[2] = stride_c;
+
+    /* the default setting of a frame */
+    frame->i_frame   = -1;
+    frame->i_frm_coi = -1;
+    frame->i_gop_idr_coi = -1;
+
+    if (h->param->chroma_format == CHROMA_400) {
+        frame->i_plane = 1;
+    }
+
+    frame->i_frm_type = XAVS2_TYPE_AUTO;
+    frame->i_pts  = -1;
+    frame->i_dts  = -1;
+    frame->b_enable_intra = (h->param->enable_intra);
+
+    /* buffer for fenc */
+    if (alloc_type == FT_ENC) {
+#if XAVS2_ADAPT_LAYER
+        /* M1, nal_info buffer */
+        frame->nal_info = (xavs2_nal_info_t *)mem_ptr;
+        frame->i_nal    = 0;
+        mem_ptr        += i_nal_info_size;
+        ALIGN_POINTER(mem_ptr);
+#endif
+
+        /* M2, set the bit stream buffer pointer and length
+         * NOTE: the size of bitstream buffer is big enough, no need to reallocate
+         *       memory in function encoder_encapsulate_nals */
+        frame->p_bs_buf = mem_ptr;
+        frame->i_bs_buf = bs_size;     /* the length is long enough */
+        mem_ptr        += bs_size;
+    }
+
+    /* M3, buffer for planes: Y+U+V */
+    frame->plane_buf10 = (pel10_t *)mem_ptr;
+    frame->size_plane_buf = (size_l + 2 * size_c) * sizeof(pel10_t);
+
+    frame->planes10[0] = (pel10_t *)mem_ptr;
+    frame->planes10[1] = frame->planes10[0] + size_l;
+    frame->planes10[2] = frame->planes10[1] + size_c;
+    mem_ptr         += (size_l + size_c * 2) * sizeof(pel10_t);
+
+    if (alloc_type == FT_DEC || alloc_type == FT_TEMP) {
+        uint8_t *p_align;
+        /* point to plane data area */
+        frame->planes10[0] += frame->i_stride[0] * (XAVS2_PAD    ) + (XAVS2_PAD    );
+        frame->planes10[1] += frame->i_stride[1] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2);
+        frame->planes10[2] += frame->i_stride[2] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2);
+
+        /* make sure the pointers are aligned */
+        p_align = (uint8_t *)frame->planes10[0];
+        ALIGN_POINTER(p_align);
+        frame->planes10[0] = (pel10_t *)p_align;
+        p_align = (uint8_t *)frame->planes10[1];
+        ALIGN_POINTER(p_align);
+        frame->planes10[1] = (pel10_t *)p_align;
+        p_align = (uint8_t *)frame->planes10[2];
+        ALIGN_POINTER(p_align);
+        frame->planes10[2] = (pel10_t *)p_align;
+    }
+
+    if (alloc_type == FT_DEC) {
+        /* buffer for luma interpolated planes */
+        frame->filtered10[0] = frame->planes10[0];  // full pel plane, reused
+        for (i = 1; i < 16; i++) {
+            frame->filtered10[i] = NULL;
+        }
+#if ENABLE_FRAME_SUBPEL_INTPL
+        switch (h->use_fractional_me) {
+        case 1:
+            frame->filtered10[2]  = (pel10_t *)mem_ptr;
+            mem_ptr            += size_l * sizeof(pel10_t);
+            frame->filtered10[8]  = (pel10_t *)mem_ptr;
+            mem_ptr            += size_l * sizeof(pel10_t);
+            frame->filtered10[10] = (pel10_t *)mem_ptr;
+            mem_ptr            += size_l * sizeof(pel10_t);
+
+            break;
+        case 2:
+            for (i = 1; i < 16; i++) {
+                frame->filtered10[i] = (pel10_t *)mem_ptr;
+                mem_ptr           += size_l * sizeof(pel10_t);
+            }
+            break;
+        default:
+            break;
+        }
+#endif
+        /* point to plane data area */
+        for (i = 1; i < 16; i++) {
+            if (frame->filtered10[i] != NULL) {
+                frame->filtered10[i] += frame->i_stride[0] * XAVS2_PAD + XAVS2_PAD;
+            }
+        }
+        ALIGN_POINTER(mem_ptr);
+
+        /* M4, reference index buffer */
+        frame->pu_ref = (int8_t *)mem_ptr;
+        mem_ptr      += frame_size_in_mvstore * sizeof(int8_t);
+        ALIGN_POINTER(mem_ptr);
+
+        /* M5, pu motion vector buffer */
+        frame->pu_mv  = (mv_t *)mem_ptr;
+        mem_ptr += frame_size_in_mvstore * sizeof(mv_t);
+        ALIGN_POINTER(mem_ptr);
+
+#if SAVE_CU_INFO
+        /* M6, cu mode/cbp/level buffers */
+        frame->cu_mode  = (int8_t *)mem_ptr;
+        mem_ptr        += frame_size_in_mincu * sizeof(int8_t);
+        ALIGN_POINTER(mem_ptr);
+        frame->cu_cbp   = (int8_t *)mem_ptr;
+        mem_ptr        += frame_size_in_mincu * sizeof(int8_t);
+        ALIGN_POINTER(mem_ptr);
+        frame->cu_level = (int8_t *)mem_ptr;
+        mem_ptr        += frame_size_in_mincu * sizeof(int8_t);
+        ALIGN_POINTER(mem_ptr);
+#endif
+
+        /* M7, line status array */
+        frame->num_lcu_coded_in_row = (int *)mem_ptr;
+        mem_ptr                    += h->i_height_in_lcu * sizeof(int);
+        ALIGN_POINTER(mem_ptr);
+
+        memset(frame->num_lcu_sao_off, 0, sizeof(frame->num_lcu_sao_off));
+    }
+
+    if (mem_ptr - (uint8_t *)frame > mem_size) {
+        xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to alloc one frame, type %d\n", alloc_type);
+        goto fail10;
+    }
+
+    /* update mem_base */
+    if (mem_base != NULL) {
+        *mem_base = /*(uint8_t**)*/mem_ptr;
+    }
+
+    /* initialize default value */
+    frame->i_qpplus1     = 0;
+    frame->cnt_refered   = 0;
+
+    /* initialize signals */
+    if (xavs2_thread_mutex_init(&frame->mutex, NULL)) {
+        goto fail10;
     }
     if (xavs2_thread_cond_init(&frame->cond, NULL)) {
-        goto fail;
+        goto fail10;
     }
 
     return frame;
 
-fail:
+fail10:
     xavs2_free(mem_ptr);
     return NULL;
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -468,11 +677,50 @@ void xavs2_frame_destroy_objects(xavs2_handler_t *h_mgr, xavs2_frame_t *frame)
 /* ---------------------------------------------------------------------------
  */
 void
-plane_expand_border(pel_t *p_pix, int i_stride, int i_width, int i_height,
+plane_expand_border8(pel8_t *p_pix, int i_stride, int i_width, int i_height,
+                    int i_padh, int i_padv, int b_pad_top, int b_pad_bottom)
+{
+    pel8_t *pix = p_pix;
+    pel8_t *row;
+    int y;
+
+    /* --- horizontal ----------------------------------------------
+     */
+    for (y = 0; y < i_height; y++) {
+        g_funcs.mem_repeat_p(pix - i_padh,  pix[0          ], i_padh);    /* left  band */
+        g_funcs.mem_repeat_p(pix + i_width, pix[i_width - 1], i_padh);    /* right band */
+        pix += i_stride;
+    }
+
+    /* --- vertical ------------------------------------------------
+     */
+    i_width += (i_padh << 1);
+
+    /* upper band */
+    if (b_pad_top) {
+        pix = row = p_pix - i_padh;   /* start row position */
+        for (y = 0; y < i_padv; y++) {
+            pix -= i_stride;
+            memcpy(pix, row, i_width * sizeof(pel8_t));
+        }
+    }
+
+    /* lower band */
+    if (b_pad_bottom) {
+        pix = row = p_pix + (i_height - 1) * i_stride - i_padh;
+        for (y = 0; y < i_padv; y++) {
+            pix += i_stride;
+            memcpy(pix, row, i_width * sizeof(pel8_t));
+        }
+    }
+}
+
+void
+plane_expand_border10(pel10_t *p_pix, int i_stride, int i_width, int i_height,
                     int i_padh, int i_padv, int b_pad_top, int b_pad_bottom)
 {
-    pel_t *pix = p_pix;
-    pel_t *row;
+    pel10_t *pix = p_pix;
+    pel10_t *row;
     int y;
 
     /* --- horizontal ----------------------------------------------
@@ -492,7 +740,7 @@ plane_expand_border(pel_t *p_pix, int i_stride, int i_width, int i_height,
         pix = row = p_pix - i_padh;   /* start row position */
         for (y = 0; y < i_padv; y++) {
             pix -= i_stride;
-            memcpy(pix, row, i_width * sizeof(pel_t));
+            memcpy(pix, row, i_width * sizeof(pel10_t));
         }
     }
 
@@ -501,7 +749,7 @@ plane_expand_border(pel_t *p_pix, int i_stride, int i_width, int i_height,
         pix = row = p_pix + (i_height - 1) * i_stride - i_padh;
         for (y = 0; y < i_padv; y++) {
             pix += i_stride;
-            memcpy(pix, row, i_width * sizeof(pel_t));
+            memcpy(pix, row, i_width * sizeof(pel10_t));
         }
     }
 }
@@ -515,9 +763,10 @@ void xavs2_frame_expand_border_frame(xavs2_t *h, xavs2_frame_t *frame)
     int b_frame_start = 1;
     int b_frame_end   = 1;
     int i;
-    pel_t *pix;
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *pix;
 
-    UNUSED_PARAMETER(h);
+    //UNUSED_PARAMETER(h);
 
     for (i = 0; i < frame->i_plane; i++) {
         int chroma = !!i;
@@ -527,8 +776,27 @@ void xavs2_frame_expand_border_frame(xavs2_t *h, xavs2_frame_t *frame)
         int pad_h  = XAVS2_PAD >> chroma;
         int pad_v  = XAVS2_PAD >> chroma;
 
-        pix = frame->planes[i] + (slice_start_y >> chroma) * stride;
-        plane_expand_border(pix, stride, width, height, pad_h, pad_v, b_frame_start, b_frame_end);
+        pix = frame->planes8[i] + (slice_start_y >> chroma) * stride;
+
+        plane_expand_border8(pix, stride, width, height, pad_h, pad_v, b_frame_start, b_frame_end);
+    }
+    } else {
+    pel10_t *pix;
+
+    //UNUSED_PARAMETER(h);
+
+    for (i = 0; i < frame->i_plane; i++) {
+        int chroma = !!i;
+        int stride = frame->i_stride[i];
+        int width  = frame->i_width[i];
+        int height = slice_height >> chroma;
+        int pad_h  = XAVS2_PAD >> chroma;
+        int pad_v  = XAVS2_PAD >> chroma;
+
+        pix = frame->planes10[i] + (slice_start_y >> chroma) * stride;
+
+        plane_expand_border10(pix, stride, width, height, pad_h, pad_v, b_frame_start, b_frame_end);
+    }
     }
 }
 
@@ -553,7 +821,8 @@ void xavs2_frame_expand_border_lcurow(xavs2_t *h, xavs2_frame_t *frame, int i_lc
         int y_start = ((i_lcu_y + 0) << (i_lcu_level - chroma_shift));
         int y_end   = ((i_lcu_y + 1) << (i_lcu_level - chroma_shift));
         int height;
-        pel_t *pix;
+        if (h->param->input_sample_bit_depth == 8) {
+        pel8_t *pix;
 
         if (i_lcu_y != h->slices[h->i_slice_index]->i_first_lcu_y) {
             y_start -= UP_SHIFT;
@@ -569,8 +838,30 @@ void xavs2_frame_expand_border_lcurow(xavs2_t *h, xavs2_frame_t *frame, int i_lc
         //               h->fenc->i_frame, h->i_slice_index, i_lcu_y, y_start, y_end);
         // }
 
-        pix = frame->planes[i] + y_start * stride;
-        plane_expand_border(pix, stride, width, height, padh, padv, b_start, b_end);
+        pix = frame->planes8[i] + y_start * stride;
+
+        plane_expand_border8(pix, stride, width, height, padh, padv, b_start, b_end);
+        } else {
+        pel10_t *pix;
+
+        if (i_lcu_y != h->slices[h->i_slice_index]->i_first_lcu_y) {
+            y_start -= UP_SHIFT;
+        }
+        if (i_lcu_y != h->slices[h->i_slice_index]->i_last_lcu_y) {
+            y_end -= UP_SHIFT;
+        }
+
+        y_end = XAVS2_MIN(frame->i_lines[i], y_end);
+        height = y_end - y_start;
+        // if (i == 0) {
+        //     xavs2_log(NULL, XAVS2_LOG_DEBUG, "Pad   POC [%3d], Slice %2d, Row %2d, [%3d, %3d)\n",
+        //               h->fenc->i_frame, h->i_slice_index, i_lcu_y, y_start, y_end);
+        // }
+
+        pix = frame->planes10[i] + y_start * stride;
+
+        plane_expand_border10(pix, stride, width, height, padh, padv, b_start, b_end);
+        }
     }
 }
 
@@ -588,9 +879,29 @@ void xavs2_frame_expand_border_mod8(xavs2_t *h, xavs2_frame_t *frame)
         int i_pady   = (h->i_height - h->param->org_height) >> i_scale;
         int i_stride = frame->i_stride[i];
 
+        if (h->param->input_sample_bit_depth == 8) {
         /* expand right border */
         if (i_padx) {
-            pel_t *pix = frame->planes[i] + i_width;
+            pel8_t *pix = frame->planes8[i] + i_width;
+            for (y = 0; y < i_height; y++) {
+                memset(pix, pix[-1], i_padx);
+                pix += i_stride;
+            }
+        }
+
+        /* expand bottom border */
+        if (i_pady) {
+            int rowlen = (i_width + i_padx) * sizeof(pel8_t);
+            pel8_t *row = frame->planes8[i] + (i_height - 1) * i_stride;
+            pel8_t *pix = frame->planes8[i] + (i_height    ) * i_stride;
+            for (y = i_height; y < i_height + i_pady; y++) {
+                memcpy(pix, row, rowlen);
+                pix += i_stride;
+            }
+        }
+        } else {
+        if (i_padx) {
+            pel10_t *pix = frame->planes10[i] + i_width;
             for (y = 0; y < i_height; y++) {
                 memset(pix, pix[-1], i_padx);
                 pix += i_stride;
@@ -599,32 +910,43 @@ void xavs2_frame_expand_border_mod8(xavs2_t *h, xavs2_frame_t *frame)
 
         /* expand bottom border */
         if (i_pady) {
-            int rowlen = (i_width + i_padx) * sizeof(pel_t);
-            pel_t *row = frame->planes[i] + (i_height - 1) * i_stride;
-            pel_t *pix = frame->planes[i] + (i_height    ) * i_stride;
+            int rowlen = (i_width + i_padx) * sizeof(pel10_t);
+            pel10_t *row = frame->planes10[i] + (i_height - 1) * i_stride;
+            pel10_t *pix = frame->planes10[i] + (i_height    ) * i_stride;
             for (y = i_height; y < i_height + i_pady; y++) {
                 memcpy(pix, row, rowlen);
                 pix += i_stride;
             }
         }
+        }
     }
 }
 
 /* ---------------------------------------------------------------------------
- * FIXME: »¹ÐèÒª¿¼ÂÇpaddingÇøÓòµÄ¿½±´
+ * FIXME: the copy still needs to take the padding region into account
  */
 void xavs2_frame_copy_planes(xavs2_t *h, xavs2_frame_t *dst, xavs2_frame_t *src)
 {
     int k;
 
-    UNUSED_PARAMETER(h);
+    //UNUSED_PARAMETER(h);
     if (dst->size_plane_buf == src->size_plane_buf && dst->i_width[0] == src->i_width[0]) {
-        g_funcs.fast_memcpy(dst->plane_buf, src->plane_buf, src->size_plane_buf);
+        if (h->param->input_sample_bit_depth == 8) {
+        g_funcs.fast_memcpy(dst->plane_buf8, src->plane_buf8, src->size_plane_buf);
+        } else {
+        g_funcs.fast_memcpy(dst->plane_buf10, src->plane_buf10, src->size_plane_buf);
+        }
     } else {
         for (k = 0; k < dst->i_plane; k++) {
-            g_funcs.plane_copy(dst->planes[k], dst->i_stride[k],
-                               src->planes[k], src->i_stride[k],
+            if (h->param->input_sample_bit_depth == 8) {
+            g_funcs.plane_copy8(h, dst->planes8[k], dst->i_stride[k],
+                               src->planes8[k], src->i_stride[k],
                                src->i_width[k], src->i_lines[k]);
+            } else {
+            g_funcs.plane_copy10(h, dst->planes10[k], dst->i_stride[k],
+                               src->planes10[k], src->i_stride[k],
+                               src->i_width[k], src->i_lines[k]);
+            }
         }
     }
 }
diff --git a/source/common/frame.h b/source/common/frame.h
index c7a1f15..376cb81 100644
--- a/source/common/frame.h
+++ b/source/common/frame.h
@@ -58,7 +58,9 @@ void xavs2_frame_destroy_objects(xavs2_handler_t *h_mgr, xavs2_frame_t *frame);
 void xavs2_frame_copy_planes(xavs2_t *h, xavs2_frame_t *dst, xavs2_frame_t *src);
 
 #define xavs2_frame_expand_border_frame FPFX(frame_expand_border_frame)
-void plane_expand_border(pel_t *p_pix, int i_stride, int i_width, int i_height,
+void plane_expand_border8(pel8_t *p_pix, int i_stride, int i_width, int i_height,
+                         int i_padh, int i_padv, int b_pad_top, int b_pad_bottom);
+void plane_expand_border10(pel10_t *p_pix, int i_stride, int i_width, int i_height,
                          int i_padh, int i_padv, int b_pad_top, int b_pad_bottom);
 void xavs2_frame_expand_border_frame(xavs2_t *h, xavs2_frame_t *frame);
 #define xavs2_frame_expand_border_lcurow FPFX(frame_expand_border_lcurow)
diff --git a/source/common/intra.c b/source/common/intra.c
index 76279fb..f27f95e 100644
--- a/source/common/intra.c
+++ b/source/common/intra.c
@@ -124,22 +124,43 @@ static const char tab_auc_dir_dxdy[2][NUM_INTRA_MODE][2] = {
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ver_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ver8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
-    pel_t *p_src = src + 1;
+    pel8_t *p_src = src + 1;
     int y;
 
     for (y = 0; y < bsy; y++) {
-        g_funcs.fast_memcpy(dst, p_src, bsx * sizeof(pel_t));
+        g_funcs.fast_memcpy(dst, p_src, bsx * sizeof(pel8_t));
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_ver10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    pel10_t *p_src = src + 1;
+    int y;
+
+    for (y = 0; y < bsy; y++) {
+        g_funcs.fast_memcpy(dst, p_src, bsx * sizeof(pel10_t));
         dst += i_dst;
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_hor_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_hor8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    pel8_t *p_src = src - 1;
+
+    while (bsy-- != 0) {
+        g_funcs.mem_repeat_p(dst, *p_src--, bsx);
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_hor10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
-    pel_t *p_src = src - 1;
+    pel10_t *p_src = src - 1;
 
     while (bsy-- != 0) {
         g_funcs.mem_repeat_p(dst, *p_src--, bsx);
@@ -150,7 +171,44 @@ static void intra_pred_hor_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, in
 /* ---------------------------------------------------------------------------
  * NOTE: dir_mode = (bAboveAvail << 8) + (bLeftAvail)
  */
-static void intra_pred_dc_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_dc8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    int avail_top  = dir_mode >> 8;
+    int avail_left = dir_mode & 0xFF;
+    int sum_left   = 0;
+    int sum_above  = 0;
+    int dc_value;
+    int x, y;
+    pel8_t *p_src;
+
+    p_src = src - 1;
+    for (y = 0; y < bsy; y++) {
+        sum_left += p_src[-y];
+    }
+
+    p_src = src + 1;
+    for (x = 0; x < bsx; x++) {
+        sum_above += p_src[x];
+    }
+
+    if (avail_left && avail_top) {
+        x = bsx + bsy;
+        dc_value = ((sum_left + sum_above + (x >> 1)) * (512 / x)) >> 9;
+    } else if (avail_left) {
+        dc_value = (sum_left  + (bsy >> 1)) >> xavs2_log2u(bsy);
+    } else if (avail_top) {
+        dc_value = (sum_above + (bsx >> 1)) >> xavs2_log2u(bsx);
+    } else {
+        dc_value = ((1 << h->param->input_sample_bit_depth) >> 1);
+    }
+
+    for (y = 0; y < bsy; y++) {
+        g_funcs.mem_repeat_p(dst, (pel8_t)dc_value, bsx);
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_dc10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int avail_top  = dir_mode >> 8;
     int avail_left = dir_mode & 0xFF;
@@ -158,7 +216,7 @@ static void intra_pred_dc_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int
     int sum_above  = 0;
     int dc_value;
     int x, y;
-    pel_t *p_src;
+    pel10_t *p_src;
 
     p_src = src - 1;
     for (y = 0; y < bsy; y++) {
@@ -178,18 +236,62 @@ static void intra_pred_dc_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int
     } else if (avail_top) {
         dc_value = (sum_above + (bsx >> 1)) >> xavs2_log2u(bsx);
     } else {
-        dc_value = g_dc_value;
+        dc_value = ((1 << h->param->input_sample_bit_depth) >> 1);
     }
 
     for (y = 0; y < bsy; y++) {
-        g_funcs.mem_repeat_p(dst, (pel_t)dc_value, bsx);
+        g_funcs.mem_repeat_p(dst, (pel10_t)dc_value, bsx);
         dst += i_dst;
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_plane_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_plane8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    /*                 size in bits:       2   3   4   5   6 */
+    static const int ib_mult [8] = { 0, 0, 13, 17,  5, 11, 23, 0 };
+    static const int ib_shift[8] = { 0, 0,  7, 10, 11, 15, 19, 0 };
+    const int mult_h  = ib_mult [tab_log2size[bsx]];
+    const int mult_v  = ib_mult [tab_log2size[bsy]];
+    const int shift_h = ib_shift[tab_log2size[bsx]];
+    const int shift_v = ib_shift[tab_log2size[bsy]];
+    const int W2   = bsx >> 1;              /* half block width */
+    const int H2   = bsy >> 1;              /* half block height */
+    const int vmax = (1 << h->param->input_sample_bit_depth) - 1;  /* max value of pixel */
+    int H = 0;
+    int V = 0;
+    int a, b, c;
+    int x, y;
+    pel8_t *p_src;
+
+    /* calculate H and V */
+    p_src = src + W2;
+    for (x = 1; x < W2 + 1; x++) {
+        H += x * (p_src[x] - p_src[-x]);
+    }
+    p_src = src - H2;
+    for (y = 1; y < H2 + 1; y++) {
+        V += y * (p_src[-y] - p_src[y]);
+    }
+
+    a  = (src[-bsy] + src[bsx]) << 4;
+    b  = ((H << 5) * mult_h + (1 << (shift_h - 1))) >> shift_h;
+    c  = ((V << 5) * mult_v + (1 << (shift_v - 1))) >> shift_v;
+    a += 16 - b * (W2 - 1) - c * (H2 - 1);
+
+    for (y = 0; y < bsy; y++) {
+        int pix = a;
+        for (x = 0; x < bsx; x++) {
+            dst[x] = (pel8_t)XAVS2_CLIP3(0, vmax, pix >> 5);
+            pix   += b;
+        }
+        dst += i_dst;
+        a   += c;
+    }
+}
+
+static void intra_pred_plane10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     /*                 size in bits:       2   3   4   5   6 */
     static const int ib_mult [8] = { 0, 0, 13, 17,  5, 11, 23, 0 };
@@ -200,12 +302,12 @@ static void intra_pred_plane_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
     const int shift_v = ib_shift[tab_log2size[bsy]];
     const int W2   = bsx >> 1;              /* half block width */
     const int H2   = bsy >> 1;              /* half block height */
-    const int vmax = (1 << g_bit_depth) - 1;  /* max value of pixel */
+    const int vmax = (1 << h->param->input_sample_bit_depth) - 1;  /* max value of pixel */
     int H = 0;
     int V = 0;
     int a, b, c;
     int x, y;
-    pel_t *p_src;
+    pel10_t *p_src;
 
     /* calculate H and V */
     p_src = src + W2;
@@ -225,7 +327,7 @@ static void intra_pred_plane_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
     for (y = 0; y < bsy; y++) {
         int pix = a;
         for (x = 0; x < bsx; x++) {
-            dst[x] = (pel_t)XAVS2_CLIP3(0, vmax, pix >> 5);
+            dst[x] = (pel10_t)XAVS2_CLIP3(0, vmax, pix >> 5);
             pix   += b;
         }
         dst += i_dst;
@@ -235,19 +337,74 @@ static void intra_pred_plane_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_bilinear_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_bilinear8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    itr8_t pTop[MAX_CU_SIZE], pLeft[MAX_CU_SIZE], pT[MAX_CU_SIZE], pL[MAX_CU_SIZE], wy[MAX_CU_SIZE];
+    int shift_x  = tab_log2size[bsx];
+    int shift_y  = tab_log2size[bsy];
+    int shift    = XAVS2_MIN(shift_x, shift_y);
+    int shift_xy = shift_x + shift_y + 1;
+    int offset   = 1 << (shift_x + shift_y);
+    int vmax     = (1 << h->param->input_sample_bit_depth) - 1;    // max value of pixel
+    int a, b, c, t, wxy, temp;
+    int predx, val;
+    int x, y;
+    pel8_t *p_src;
+
+    p_src = src + 1;
+    for (x = 0; x < bsx; x++) {
+        pTop[x] = p_src[x];
+    }
+    p_src = src - 1;
+    for (y = 0; y < bsy; y++) {
+        pLeft[y] = p_src[-y];
+    }
+
+    a = pTop [bsx - 1];
+    b = pLeft[bsy - 1];
+    c = (bsx == bsy) ? (a + b + 1) >> 1 : (((a << shift_x) + (b << shift_y)) * 13 + (1 << (shift + 5))) >> (shift + 6);
+    t = (c << 1) - a - b;
+
+    for (x = 0; x < bsx; x++) {
+        pT  [x]   = (itr8_t)(b - pTop[x]);
+        pTop[x] <<= shift_y;
+    }
+
+    temp = 0;
+    for (y = 0; y < bsy; y++) {
+        pL   [y]   = (itr8_t)(a - pLeft[y]);
+        pLeft[y] <<= shift_x;
+        wy   [y]   = (itr8_t)temp;
+        temp      += t;
+    }
+
+    for (y = 0; y < bsy; y++) {
+        predx = pLeft[y];
+        wxy   = -wy[y];
+        for (x = 0; x < bsx; x++) {
+            predx   += pL[y];
+            wxy     += wy[y];
+            pTop[x] += pT[x];
+            val      = ((predx << shift_y) + (pTop[x] << shift_x) + wxy + offset) >> shift_xy;
+            dst[x]   = (pel8_t)XAVS2_CLIP3(0, vmax, val);
+        }
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_bilinear10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
-    itr_t pTop[MAX_CU_SIZE], pLeft[MAX_CU_SIZE], pT[MAX_CU_SIZE], pL[MAX_CU_SIZE], wy[MAX_CU_SIZE];
+    itr10_t pTop[MAX_CU_SIZE], pLeft[MAX_CU_SIZE], pT[MAX_CU_SIZE], pL[MAX_CU_SIZE], wy[MAX_CU_SIZE];
     int shift_x  = tab_log2size[bsx];
     int shift_y  = tab_log2size[bsy];
     int shift    = XAVS2_MIN(shift_x, shift_y);
     int shift_xy = shift_x + shift_y + 1;
     int offset   = 1 << (shift_x + shift_y);
-    int vmax     = max_pel_value;    // max value of pixel
+    int vmax     = (1 << h->param->input_sample_bit_depth) - 1;    // max value of pixel
     int a, b, c, t, wxy, temp;
     int predx, val;
     int x, y;
-    pel_t *p_src;
+    pel10_t *p_src;
 
     p_src = src + 1;
     for (x = 0; x < bsx; x++) {
@@ -264,15 +421,15 @@ static void intra_pred_bilinear_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
     t = (c << 1) - a - b;
 
     for (x = 0; x < bsx; x++) {
-        pT  [x]   = (itr_t)(b - pTop[x]);
+        pT  [x]   = (itr10_t)(b - pTop[x]);
         pTop[x] <<= shift_y;
     }
 
     temp = 0;
     for (y = 0; y < bsy; y++) {
-        pL   [y]   = (itr_t)(a - pLeft[y]);
+        pL   [y]   = (itr10_t)(a - pLeft[y]);
         pLeft[y] <<= shift_x;
-        wy   [y]   = (itr_t)temp;
+        wy   [y]   = (itr10_t)temp;
         temp      += t;
     }
 
@@ -284,7 +441,7 @@ static void intra_pred_bilinear_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
             wxy     += wy[y];
             pTop[x] += pT[x];
             val      = ((predx << shift_y) + (pTop[x] << shift_x) + wxy + offset) >> shift_xy;
-            dst[x]   = (pel_t)XAVS2_CLIP3(0, vmax, val);
+            dst[x]   = (pel10_t)XAVS2_CLIP3(0, vmax, val);
         }
         dst += i_dst;
     }
@@ -307,7 +464,31 @@ static ALWAYS_INLINE int get_context_pixel(int dir_mode, int xy_flag, int temp_d
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_x_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_x_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    int f0, f1, f2, f3;
+    int i, j;
+    int iX;
+
+    for (j = 0; j < bsy; j++) {
+        iX = get_context_pixel(dir_mode, 0, j + 1, &f3);
+        f0 = 32 - f3;
+        f1 = 64 - f3;
+        f2 = 32 + f3;
+
+        for (i = 0; i < bsx; i++) {
+            dst[i] = (pel8_t)((src[iX    ] * f0 +
+                              src[iX + 1] * f1 +
+                              src[iX + 2] * f2 +
+                              src[iX + 3] * f3 + 64) >> 7);
+            iX++;
+        }
+
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_ang10_x_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int f0, f1, f2, f3;
     int i, j;
@@ -320,7 +501,7 @@ static void intra_pred_ang_x_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
         f2 = 32 + f3;
 
         for (i = 0; i < bsx; i++) {
-            dst[i] = (pel_t)((src[iX    ] * f0 +
+            dst[i] = (pel10_t)((src[iX    ] * f0 +
                               src[iX + 1] * f1 +
                               src[iX + 2] * f2 +
                               src[iX + 3] * f3 + 64) >> 7);
@@ -333,7 +514,32 @@ static void intra_pred_ang_x_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_y_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_y_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    int offsets[64];
+    int xsteps[64];
+    int offset;
+    int i, j;
+    int iY;
+
+    for (i = 0; i < bsx; i++) {
+        xsteps[i] = get_context_pixel(dir_mode, 1, i + 1, &offsets[i]);
+    }
+
+    for (j = 0; j < bsy; j++) {
+        for (i = 0; i < bsx; i++) {
+            iY     = j + xsteps[i];
+            offset = offsets[i];
+            dst[i] = (pel8_t)((src[-iY    ] * (32 - offset) +
+                              src[-iY - 1] * (64 - offset) +
+                              src[-iY - 2] * (32 + offset) +
+                              src[-iY - 3] * (     offset) + 64) >> 7);
+        }
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_ang10_y_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int offsets[64];
     int xsteps[64];
@@ -349,7 +555,7 @@ static void intra_pred_ang_y_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
         for (i = 0; i < bsx; i++) {
             iY     = j + xsteps[i];
             offset = offsets[i];
-            dst[i] = (pel_t)((src[-iY    ] * (32 - offset) +
+            dst[i] = (pel10_t)((src[-iY    ] * (32 - offset) +
                               src[-iY - 1] * (64 - offset) +
                               src[-iY - 2] * (32 + offset) +
                               src[-iY - 3] * (     offset) + 64) >> 7);
@@ -360,7 +566,7 @@ static void intra_pred_ang_y_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_xy_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_xy_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     ALIGN16(int xoffsets[64]);
     ALIGN16(int xsteps[64]);
@@ -378,13 +584,13 @@ static void intra_pred_ang_xy_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
             iYy = j - xsteps[i];
 
             if (iYy <= -1) {
-                dst[i] = (pel_t)((src[ iXx + 2] * (32 - offsetx) +
+                dst[i] = (pel8_t)((src[ iXx + 2] * (32 - offsetx) +
                                   src[ iXx + 1] * (64 - offsetx) +
                                   src[ iXx    ] * (32 + offsetx) +
                                   src[ iXx - 1] * (     offsetx) + 64) >> 7);
             } else {
                 offsety = xoffsets[i];
-                dst[i] = (pel_t)((src[-iYy - 2] * (32 - offsety) +
+                dst[i] = (pel8_t)((src[-iYy - 2] * (32 - offsety) +
                                   src[-iYy - 1] * (64 - offsety) +
                                   src[-iYy    ] * (32 + offsety) +
                                   src[-iYy + 1] * (     offsety) + 64) >> 7);
@@ -395,18 +601,84 @@ static void intra_pred_ang_xy_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
     }
 }
 
+static void intra_pred_ang10_xy_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    ALIGN16(int xoffsets[64]);
+    ALIGN16(int xsteps[64]);
+    int i, j, iXx, iYy;
+    int offsetx, offsety;
+
+    for (i = 0; i < bsx; i++) {
+        xsteps[i] = get_context_pixel(dir_mode, 1, i + 1, &xoffsets[i]);
+    }
+
+    for (j = 0; j < bsy; j++) {
+        iXx = -get_context_pixel(dir_mode, 0, j + 1, &offsetx);
+
+        for (i = 0; i < bsx; i++) {
+            iYy = j - xsteps[i];
+
+            if (iYy <= -1) {
+                dst[i] = (pel10_t)((src[ iXx + 2] * (32 - offsetx) +
+                                  src[ iXx + 1] * (64 - offsetx) +
+                                  src[ iXx    ] * (32 + offsetx) +
+                                  src[ iXx - 1] * (     offsetx) + 64) >> 7);
+            } else {
+                offsety = xoffsets[i];
+                dst[i] = (pel10_t)((src[-iYy - 2] * (32 - offsety) +
+                                  src[-iYy - 1] * (64 - offsety) +
+                                  src[-iYy    ] * (32 + offsety) +
+                                  src[-iYy + 1] * (     offsety) + 64) >> 7);
+            }
+            iXx++;
+        }
+        dst += i_dst;
+    }
+}
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_x_3_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_x_3_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    ALIGN16(pel8_t first_line[(64 + 176) << 2]);
+    int line_size = bsx + (bsy >> 2) * 11 - 1;
+
+    int aligned_line_size = 64 + 176;
+    int i_dst4 = i_dst << 2;
+    int i;
+    pel8_t *pfirst[4];
+
+    pfirst[0] = first_line;
+    pfirst[1] = pfirst[0] + aligned_line_size;
+    pfirst[2] = pfirst[1] + aligned_line_size;
+    pfirst[3] = pfirst[2] + aligned_line_size;
+
+    for (i = 0; i < line_size; i++, src++) {
+        pfirst[0][i] = (pel8_t)((    src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4);
+        pfirst[1][i] = (pel8_t)((    src[5] + 3 * src[6] + 3 * src[7] +     src[8] + 4) >> 3);
+        pfirst[2][i] = (pel8_t)((3 * src[8] + 7 * src[9] + 5 * src[10] +     src[11] + 8) >> 4);
+        pfirst[3][i] = (pel8_t)((    src[11] + 2 * src[12] +   src[13] + 0 * src[14] + 2) >> 2);
+    }
+
+    bsy >>= 2;
+    for (i = 0; i < bsy; i++) {
+        memcpy(dst            , pfirst[0] + i * 11, bsx * sizeof(pel8_t));
+        memcpy(dst +     i_dst, pfirst[1] + i * 11, bsx * sizeof(pel8_t));
+        memcpy(dst + 2 * i_dst, pfirst[2] + i * 11, bsx * sizeof(pel8_t));
+        memcpy(dst + 3 * i_dst, pfirst[3] + i * 11, bsx * sizeof(pel8_t));
+        dst += i_dst4;
+    }
+}
+
+static void intra_pred_ang10_x_3_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
-    ALIGN16(pel_t first_line[(64 + 176) << 2]);
+    ALIGN16(pel10_t first_line[(64 + 176) << 2]);
     int line_size = bsx + (bsy >> 2) * 11 - 1;
 
     int aligned_line_size = 64 + 176;
     int i_dst4 = i_dst << 2;
     int i;
-    pel_t *pfirst[4];
+    pel10_t *pfirst[4];
 
     pfirst[0] = first_line;
     pfirst[1] = pfirst[0] + aligned_line_size;
@@ -414,62 +686,80 @@ static void intra_pred_ang_x_3_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode
     pfirst[3] = pfirst[2] + aligned_line_size;
 
     for (i = 0; i < line_size; i++, src++) {
-        pfirst[0][i] = (pel_t)((    src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4);
-        pfirst[1][i] = (pel_t)((    src[5] + 3 * src[6] + 3 * src[7] +     src[8] + 4) >> 3);
-        pfirst[2][i] = (pel_t)((3 * src[8] + 7 * src[9] + 5 * src[10] +     src[11] + 8) >> 4);
-        pfirst[3][i] = (pel_t)((    src[11] + 2 * src[12] +   src[13] + 0 * src[14] + 2) >> 2);
+        pfirst[0][i] = (pel10_t)((    src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4);
+        pfirst[1][i] = (pel10_t)((    src[5] + 3 * src[6] + 3 * src[7] +     src[8] + 4) >> 3);
+        pfirst[2][i] = (pel10_t)((3 * src[8] + 7 * src[9] + 5 * src[10] +     src[11] + 8) >> 4);
+        pfirst[3][i] = (pel10_t)((    src[11] + 2 * src[12] +   src[13] + 0 * src[14] + 2) >> 2);
     }
 
     bsy >>= 2;
     for (i = 0; i < bsy; i++) {
-        memcpy(dst            , pfirst[0] + i * 11, bsx * sizeof(pel_t));
-        memcpy(dst +     i_dst, pfirst[1] + i * 11, bsx * sizeof(pel_t));
-        memcpy(dst + 2 * i_dst, pfirst[2] + i * 11, bsx * sizeof(pel_t));
-        memcpy(dst + 3 * i_dst, pfirst[3] + i * 11, bsx * sizeof(pel_t));
+        memcpy(dst            , pfirst[0] + i * 11, bsx * sizeof(pel10_t));
+        memcpy(dst +     i_dst, pfirst[1] + i * 11, bsx * sizeof(pel10_t));
+        memcpy(dst + 2 * i_dst, pfirst[2] + i * 11, bsx * sizeof(pel10_t));
+        memcpy(dst + 3 * i_dst, pfirst[3] + i * 11, bsx * sizeof(pel10_t));
         dst += i_dst4;
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_x_4_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_x_4_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    ALIGN16(pel8_t first_line[64 + 128]);
+    int line_size = bsx + ((bsy - 1) << 1);
+    int iHeight2 = bsy << 1;
+    int i;
+
+    src += 3;
+    for (i = 0; i < line_size; i++, src++) {
+        first_line[i] = (pel8_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2);
+    }
+
+    for (i = 0; i < iHeight2; i += 2) {
+        memcpy(dst, first_line + i, bsx * sizeof(pel8_t));
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_ang10_x_4_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
-    ALIGN16(pel_t first_line[64 + 128]);
+    ALIGN16(pel10_t first_line[64 + 128]);
     int line_size = bsx + ((bsy - 1) << 1);
     int iHeight2 = bsy << 1;
     int i;
 
     src += 3;
     for (i = 0; i < line_size; i++, src++) {
-        first_line[i] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2);
+        first_line[i] = (pel10_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2);
     }
 
     for (i = 0; i < iHeight2; i += 2) {
-        memcpy(dst, first_line + i, bsx * sizeof(pel_t));
+        memcpy(dst, first_line + i, bsx * sizeof(pel10_t));
         dst += i_dst;
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_x_5_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
 
     if (((bsy > 4) && (bsx > 8))) {
-        ALIGN16(pel_t first_line[(64 + 80) << 3]);
+        ALIGN16(pel8_t first_line[(64 + 80) << 3]);
         int line_size = bsx + (((bsy - 8) * 11) >> 3);
         int aligned_line_size = ((line_size + 15) >> 4) << 4;
-        pel_t *pfirst[8];
+        pel8_t *pfirst[8];
 
-        pel_t *dst1 = dst;
-        pel_t *dst2 = dst1 + i_dst;
-        pel_t *dst3 = dst2 + i_dst;
-        pel_t *dst4 = dst3 + i_dst;
-        pel_t *dst5 = dst4 + i_dst;
-        pel_t *dst6 = dst5 + i_dst;
-        pel_t *dst7 = dst6 + i_dst;
-        pel_t *dst8 = dst7 + i_dst;
+        pel8_t *dst1 = dst;
+        pel8_t *dst2 = dst1 + i_dst;
+        pel8_t *dst3 = dst2 + i_dst;
+        pel8_t *dst4 = dst3 + i_dst;
+        pel8_t *dst5 = dst4 + i_dst;
+        pel8_t *dst6 = dst5 + i_dst;
+        pel8_t *dst7 = dst6 + i_dst;
+        pel8_t *dst8 = dst7 + i_dst;
 
         pfirst[0] = first_line;
         pfirst[1] = pfirst[0] + aligned_line_size;
@@ -481,27 +771,27 @@ static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode
         pfirst[7] = pfirst[6] + aligned_line_size;
 
         for (i = 0; i < line_size; src++, i++) {
-            pfirst[0][i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
-            pfirst[1][i] = (pel_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
-            pfirst[2][i] = (pel_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
-            pfirst[3][i] = (pel_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
+            pfirst[0][i] = (pel8_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
+            pfirst[1][i] = (pel8_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
+            pfirst[2][i] = (pel8_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
+            pfirst[3][i] = (pel8_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
 
-            pfirst[4][i] = (pel_t)((     src[6] +  9 * src[7]  + 15 * src[8]  +  7 * src[9]  + 16) >> 5);
-            pfirst[5][i] = (pel_t)(( 3 * src[8] +  7 * src[9]  +  5 * src[10] +      src[11] +  8) >> 4);
-            pfirst[6][i] = (pel_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] +  5 * src[12] + 16) >> 5);
-            pfirst[7][i] = (pel_t)((    src[11] +  2 * src[12] +      src[13]                 + 2) >> 2);
+            pfirst[4][i] = (pel8_t)((     src[6] +  9 * src[7]  + 15 * src[8]  +  7 * src[9]  + 16) >> 5);
+            pfirst[5][i] = (pel8_t)(( 3 * src[8] +  7 * src[9]  +  5 * src[10] +      src[11] +  8) >> 4);
+            pfirst[6][i] = (pel8_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] +  5 * src[12] + 16) >> 5);
+            pfirst[7][i] = (pel8_t)((    src[11] +  2 * src[12] +      src[13]                 + 2) >> 2);
         }
 
         bsy  >>= 3;
         for (i = 0; i < bsy; i++) {
-            memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t));
-            memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t));
-            memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t));
-            memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t));
-            memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel_t));
-            memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel_t));
-            memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel_t));
-            memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel_t));
+            memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel8_t));
+            memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel8_t));
+            memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel8_t));
+            memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel8_t));
+            memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel8_t));
+            memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel8_t));
+            memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel8_t));
+            memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel8_t));
 
             dst1 = dst8 + i_dst;
             dst2 = dst1 + i_dst;
@@ -513,41 +803,41 @@ static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode
             dst8 = dst7 + i_dst;
         }
     } else if (bsx == 16) {
-        pel_t *dst1 = dst;
-        pel_t *dst2 = dst1 + i_dst;
-        pel_t *dst3 = dst2 + i_dst;
-        pel_t *dst4 = dst3 + i_dst;
+        pel8_t *dst1 = dst;
+        pel8_t *dst2 = dst1 + i_dst;
+        pel8_t *dst3 = dst2 + i_dst;
+        pel8_t *dst4 = dst3 + i_dst;
 
         for (i = 0; i < bsx; i++, src++) {
-            dst1[i]  = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
-            dst2[i]  = (pel_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
-            dst3[i]  = (pel_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
-            dst4[i]  = (pel_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
+            dst1[i]  = (pel8_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
+            dst2[i]  = (pel8_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
+            dst3[i]  = (pel8_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
+            dst4[i]  = (pel8_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
         }
     } else if (bsx == 8) {
-        pel_t *dst1 = dst;
-        pel_t *dst2 = dst1 + i_dst;
-        pel_t *dst3 = dst2 + i_dst;
-        pel_t *dst4 = dst3 + i_dst;
-        pel_t *dst5 = dst4 + i_dst;
-        pel_t *dst6 = dst5 + i_dst;
-        pel_t *dst7 = dst6 + i_dst;
-        pel_t *dst8 = dst7 + i_dst;
+        pel8_t *dst1 = dst;
+        pel8_t *dst2 = dst1 + i_dst;
+        pel8_t *dst3 = dst2 + i_dst;
+        pel8_t *dst4 = dst3 + i_dst;
+        pel8_t *dst5 = dst4 + i_dst;
+        pel8_t *dst6 = dst5 + i_dst;
+        pel8_t *dst7 = dst6 + i_dst;
+        pel8_t *dst8 = dst7 + i_dst;
 
         for (i = 0; i < 8; src++, i++) {
-            dst1[i]  = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
-            dst2[i]  = (pel_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
-            dst3[i]  = (pel_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
-            dst4[i]  = (pel_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
+            dst1[i]  = (pel8_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
+            dst2[i]  = (pel8_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
+            dst3[i]  = (pel8_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
+            dst4[i]  = (pel8_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
 
-            dst5[i] = (pel_t)((     src[6] +  9 * src[7]  + 15 * src[8]  +  7 * src[9]  + 16) >> 5);
-            dst6[i] = (pel_t)(( 3 * src[8] +  7 * src[9]  +  5 * src[10] +      src[11] + 8) >> 4);
-            dst7[i] = (pel_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] +  5 * src[12] + 16) >> 5);
-            dst8[i] = (pel_t)((    src[11] +  2 * src[12] +      src[13]                 + 2) >> 2);
+            dst5[i] = (pel8_t)((     src[6] +  9 * src[7]  + 15 * src[8]  +  7 * src[9]  + 16) >> 5);
+            dst6[i] = (pel8_t)(( 3 * src[8] +  7 * src[9]  +  5 * src[10] +      src[11] + 8) >> 4);
+            dst7[i] = (pel8_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] +  5 * src[12] + 16) >> 5);
+            dst8[i] = (pel8_t)((    src[11] +  2 * src[12] +      src[13]                 + 2) >> 2);
         }
         if (bsy == 32) {
             //src -> 8,src[8] -> 16
-            pel_t pad1 = src[8];
+            pel8_t pad1 = src[8];
             dst1 = dst8 + i_dst;
             int j;
             for (j = 0; j < 24; j++) {
@@ -562,32 +852,32 @@ static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode
             dst3 = dst2 + i_dst;
 
             src += 4;
-            dst1[0] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5);
-            dst1[1] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
-            dst1[2] = (pel_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5);
-            dst1[3] = (pel_t)((5 * src[3] + 13 * src[4] + 11 * src[5] + 3 * src[6] + 16) >> 5);
-            dst2[0] = (pel_t)((src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4);
-            dst2[1] = (pel_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4);
-            dst2[2] = (pel_t)((src[3] + 5 * src[4] + 7 * src[5] + 3 * src[6] + 8) >> 4);
-            dst3[0] = (pel_t)((7 * src[3] + 15 * src[4] +  9 * src[5] +     src[6] + 16) >> 5);
+            dst1[0] = (pel8_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5);
+            dst1[1] = (pel8_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
+            dst1[2] = (pel8_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5);
+            dst1[3] = (pel8_t)((5 * src[3] + 13 * src[4] + 11 * src[5] + 3 * src[6] + 16) >> 5);
+            dst2[0] = (pel8_t)((src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4);
+            dst2[1] = (pel8_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4);
+            dst2[2] = (pel8_t)((src[3] + 5 * src[4] + 7 * src[5] + 3 * src[6] + 8) >> 4);
+            dst3[0] = (pel8_t)((7 * src[3] + 15 * src[4] +  9 * src[5] +     src[6] + 16) >> 5);
         }
     } else {
-        pel_t *dst1 = dst;
-        pel_t *dst2 = dst1 + i_dst;
-        pel_t *dst3 = dst2 + i_dst;
-        pel_t *dst4 = dst3 + i_dst;
+        pel8_t *dst1 = dst;
+        pel8_t *dst2 = dst1 + i_dst;
+        pel8_t *dst3 = dst2 + i_dst;
+        pel8_t *dst4 = dst3 + i_dst;
 
         for (i = 0; i < 4; i++, src++) {
-            dst1[i]  = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
-            dst2[i]  = (pel_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
-            dst3[i]  = (pel_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
-            dst4[i]  = (pel_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
+            dst1[i]  = (pel8_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
+            dst2[i]  = (pel8_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
+            dst3[i]  = (pel8_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
+            dst4[i]  = (pel8_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
         }
         if (bsy == 16) {
-            pel_t *dst5 = dst4 + i_dst;
+            pel8_t *dst5 = dst4 + i_dst;
 
             src += 4;
-            pel_t pad1 = src[0];
+            pel8_t pad1 = src[0];
 
             int j;
             for (j = 0; j < 12; j++) {
@@ -597,107 +887,332 @@ static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode
                 dst5 += i_dst;
             }
             dst5 = dst4 + i_dst;
-            dst5[0] = (pel_t)((src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5);
-            dst5[1] = (pel_t)((src[-1] + 9 * src[ 0] + 15 * src[1] + 7 * src[2] + 16) >> 5);
+            dst5[0] = (pel8_t)((src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5);
+            dst5[1] = (pel8_t)((src[-1] + 9 * src[ 0] + 15 * src[1] + 7 * src[2] + 16) >> 5);
         }
     }
 }
 
-/* ---------------------------------------------------------------------------
- */
-static void intra_pred_ang_x_6_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang10_x_5_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) /* pel10_t variant of the x_5 predictor; picks a code path from bsx/bsy; h and dir_mode are not referenced in this body */
 {
-    ALIGN16(pel_t first_line[64 + 64]);
-    int line_size = bsx + bsy - 1;
     int i;
 
-    for (i = 0; i < line_size; i++, src++) {
-        first_line[i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2);
-    }
+    if (((bsy > 4) && (bsx > 8))) { /* wide and tall blocks: filter 8 lines once, then copy rows out of them */
+        ALIGN16(pel10_t first_line[(64 + 80) << 3]); /* backing storage for the 8 filtered lines, laid out back to back */
+        int line_size = bsx + (((bsy - 8) * 11) >> 3);
+        int aligned_line_size = ((line_size + 15) >> 4) << 4; /* line_size rounded up to a multiple of 16 */
+        pel10_t *pfirst[8];
 
-    for (i = 0; i < bsy; i++) {
-        memcpy(dst, first_line + i, bsx * sizeof(pel_t));
+        pel10_t *dst1 = dst; /* dst1..dst8 address eight consecutive output rows */
+        pel10_t *dst2 = dst1 + i_dst;
+        pel10_t *dst3 = dst2 + i_dst;
+        pel10_t *dst4 = dst3 + i_dst;
+        pel10_t *dst5 = dst4 + i_dst;
+        pel10_t *dst6 = dst5 + i_dst;
+        pel10_t *dst7 = dst6 + i_dst;
+        pel10_t *dst8 = dst7 + i_dst;
+
+        pfirst[0] = first_line;
+        pfirst[1] = pfirst[0] + aligned_line_size;
+        pfirst[2] = pfirst[1] + aligned_line_size;
+        pfirst[3] = pfirst[2] + aligned_line_size;
+        pfirst[4] = pfirst[3] + aligned_line_size;
+        pfirst[5] = pfirst[4] + aligned_line_size;
+        pfirst[6] = pfirst[5] + aligned_line_size;
+        pfirst[7] = pfirst[6] + aligned_line_size;
+
+        for (i = 0; i < line_size; src++, i++) { /* each of the 8 lines has its own tap weights and source offset */
+            pfirst[0][i] = (pel10_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
+            pfirst[1][i] = (pel10_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
+            pfirst[2][i] = (pel10_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
+            pfirst[3][i] = (pel10_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
+
+            pfirst[4][i] = (pel10_t)((     src[6] +  9 * src[7]  + 15 * src[8]  +  7 * src[9]  + 16) >> 5);
+            pfirst[5][i] = (pel10_t)(( 3 * src[8] +  7 * src[9]  +  5 * src[10] +      src[11] +  8) >> 4);
+            pfirst[6][i] = (pel10_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] +  5 * src[12] + 16) >> 5);
+            pfirst[7][i] = (pel10_t)((    src[11] +  2 * src[12] +      src[13]                 + 2) >> 2);
+        }
+
+        bsy  >>= 3; /* from here bsy counts groups of 8 rows */
+        for (i = 0; i < bsy; i++) {
+            memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel10_t)); /* group i reads every line starting 11*i samples in */
+            memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel10_t));
+            memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel10_t));
+            memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel10_t));
+            memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel10_t));
+            memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel10_t));
+            memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel10_t));
+            memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel10_t));
+
+            dst1 = dst8 + i_dst; /* step all eight row pointers to the next group */
+            dst2 = dst1 + i_dst;
+            dst3 = dst2 + i_dst;
+            dst4 = dst3 + i_dst;
+            dst5 = dst4 + i_dst;
+            dst6 = dst5 + i_dst;
+            dst7 = dst6 + i_dst;
+            dst8 = dst7 + i_dst;
+        }
+    } else if (bsx == 16) { /* reached only when bsy <= 4: only four rows are written */
+        pel10_t *dst1 = dst;
+        pel10_t *dst2 = dst1 + i_dst;
+        pel10_t *dst3 = dst2 + i_dst;
+        pel10_t *dst4 = dst3 + i_dst;
+
+        for (i = 0; i < bsx; i++, src++) {
+            dst1[i]  = (pel10_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
+            dst2[i]  = (pel10_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
+            dst3[i]  = (pel10_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
+            dst4[i]  = (pel10_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
+        }
+    } else if (bsx == 8) { /* 8-wide blocks: the first eight rows are filtered straight into dst */
+        pel10_t *dst1 = dst;
+        pel10_t *dst2 = dst1 + i_dst;
+        pel10_t *dst3 = dst2 + i_dst;
+        pel10_t *dst4 = dst3 + i_dst;
+        pel10_t *dst5 = dst4 + i_dst;
+        pel10_t *dst6 = dst5 + i_dst;
+        pel10_t *dst7 = dst6 + i_dst;
+        pel10_t *dst8 = dst7 + i_dst;
+
+        for (i = 0; i < 8; src++, i++) {
+            dst1[i]  = (pel10_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
+            dst2[i]  = (pel10_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
+            dst3[i]  = (pel10_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
+            dst4[i]  = (pel10_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
+
+            dst5[i] = (pel10_t)((     src[6] +  9 * src[7]  + 15 * src[8]  +  7 * src[9]  + 16) >> 5);
+            dst6[i] = (pel10_t)(( 3 * src[8] +  7 * src[9]  +  5 * src[10] +      src[11] + 8) >> 4);
+            dst7[i] = (pel10_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] +  5 * src[12] + 16) >> 5);
+            dst8[i] = (pel10_t)((    src[11] +  2 * src[12] +      src[13]                 + 2) >> 2);
+        }
+        if (bsy == 32) { /* 8x32: rows 8..31 are first flooded with one sample, then the leading samples of rows 8..10 are re-filtered */
+            //src -> 8,src[8] -> 16
+            pel10_t pad1 = src[8]; /* the single sample used to flood the lower rows */
+            dst1 = dst8 + i_dst;
+            int j;
+            for (j = 0; j < 24; j++) {
+                for (i = 0; i < 8; i++) {
+                    dst1[i] = pad1;
+                }
+                dst1 += i_dst;
+            }
+
+            dst1 = dst8 + i_dst; /* rewind to row 8 to overwrite the few samples that are not padding */
+            dst2 = dst1 + i_dst;
+            dst3 = dst2 + i_dst;
+
+            src += 4;
+            dst1[0] = (pel10_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5);
+            dst1[1] = (pel10_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
+            dst1[2] = (pel10_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5);
+            dst1[3] = (pel10_t)((5 * src[3] + 13 * src[4] + 11 * src[5] + 3 * src[6] + 16) >> 5);
+            dst2[0] = (pel10_t)((src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4);
+            dst2[1] = (pel10_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4);
+            dst2[2] = (pel10_t)((src[3] + 5 * src[4] + 7 * src[5] + 3 * src[6] + 8) >> 4);
+            dst3[0] = (pel10_t)((7 * src[3] + 15 * src[4] +  9 * src[5] +     src[6] + 16) >> 5);
+        }
+    } else { /* all remaining sizes: the loop below writes four columns of four rows */
+        pel10_t *dst1 = dst;
+        pel10_t *dst2 = dst1 + i_dst;
+        pel10_t *dst3 = dst2 + i_dst;
+        pel10_t *dst4 = dst3 + i_dst;
+
+        for (i = 0; i < 4; i++, src++) {
+            dst1[i]  = (pel10_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5);
+            dst2[i]  = (pel10_t)((    src[2] +  5 * src[3] +  7 * src[4] + 3 * src[5] + 8) >> 4);
+            dst3[i]  = (pel10_t)((7 * src[4] + 15 * src[5] +  9 * src[6] +     src[7] + 16) >> 5);
+            dst4[i]  = (pel10_t)((    src[5] +  3 * src[6] +  3 * src[7] +     src[8] + 4) >> 3);
+        }
+        if (bsy == 16) { /* rows 4..15 are flooded with pad1, then the first two samples of row 4 are re-filtered */
+            pel10_t *dst5 = dst4 + i_dst;
+
+            src += 4;
+            pel10_t pad1 = src[0];
+
+            int j;
+            for (j = 0; j < 12; j++) {
+                for (i = 0; i < 4; i++) {
+                    dst5[i] = pad1;
+                }
+                dst5 += i_dst;
+            }
+            dst5 = dst4 + i_dst; /* rewind to row 4 */
+            dst5[0] = (pel10_t)((src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5);
+            dst5[1] = (pel10_t)((src[-1] + 9 * src[ 0] + 15 * src[1] + 7 * src[2] + 16) >> 5);
+        }
+    }
+}
+
+/* ---------------------------------------------------------------------------
+ */
+static void intra_pred_ang8_x_6_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) /* pel8_t variant: writes bsy rows of bsx samples, stride i_dst; h and dir_mode are not referenced in this body */
+{
+    ALIGN16(pel8_t first_line[64 + 64]); /* scratch line of filtered reference samples shared by all rows */
+    int line_size = bsx + bsy - 1; /* row bsy-1 starts at offset bsy-1 and needs bsx samples */
+    int i;
+
+    for (i = 0; i < line_size; i++, src++) {
+        first_line[i] = (pel8_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); /* 3-tap (1,2,1)/4 filter with rounding */
+    }
+
+    for (i = 0; i < bsy; i++) {
+        memcpy(dst, first_line + i, bsx * sizeof(pel8_t)); /* row i is first_line shifted by i samples */
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_ang10_x_6_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) /* pel10_t variant: writes bsy rows of bsx samples, stride i_dst; h and dir_mode are not referenced in this body */
+{
+    ALIGN16(pel10_t first_line[64 + 64]); /* scratch line of filtered reference samples shared by all rows */
+    int line_size = bsx + bsy - 1; /* row bsy-1 starts at offset bsy-1 and needs bsx samples */
+    int i;
+
+    for (i = 0; i < line_size; i++, src++) {
+        first_line[i] = (pel10_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); /* 3-tap (1,2,1)/4 filter with rounding */
+    }
+
+    for (i = 0; i < bsy; i++) {
+        memcpy(dst, first_line + i, bsx * sizeof(pel10_t)); /* row i is first_line shifted by i samples */
         dst += i_dst;
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_x_7_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_x_7_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) /* pel8_t variant: hand-written 4-tap rows for bsy == 4 and bsy == 8, generic intra_pred_ang8_x_c for every other height */
 {
     int i;
-    pel_t *dst1 = dst;
-    pel_t *dst2 = dst1 + i_dst;
-    pel_t *dst3 = dst2 + i_dst;
-    pel_t *dst4 = dst3 + i_dst;
+    pel8_t *dst1 = dst; /* dst1..dst4 address the first four output rows */
+    pel8_t *dst2 = dst1 + i_dst;
+    pel8_t *dst3 = dst2 + i_dst;
+    pel8_t *dst4 = dst3 + i_dst;
     if (bsy == 4) {
         for (i = 0; i < bsx; src++, i++) {
-            dst1[i] = (pel_t)((src[0] *  9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7);
-            dst2[i] = (pel_t)((src[1] *  9 + src[2] * 25 + src[3] * 23 + src[4] *  7 + 32) >> 6);
-            dst3[i] = (pel_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] *  5 + 64) >> 7);
-            dst4[i] = (pel_t)((src[2] *  3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7);
+            dst1[i] = (pel8_t)((src[0] *  9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); /* each row has its own tap weights; the added constant is half the divisor, for rounding */
+            dst2[i] = (pel8_t)((src[1] *  9 + src[2] * 25 + src[3] * 23 + src[4] *  7 + 32) >> 6);
+            dst3[i] = (pel8_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] *  5 + 64) >> 7);
+            dst4[i] = (pel8_t)((src[2] *  3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7);
         }
     } else if (bsy == 8) {
-        pel_t *dst5 = dst4 + i_dst;
-        pel_t *dst6 = dst5 + i_dst;
-        pel_t *dst7 = dst6 + i_dst;
-        pel_t *dst8 = dst7 + i_dst;
+        pel8_t *dst5 = dst4 + i_dst; /* rows 4..7, needed only for the 8-row case */
+        pel8_t *dst6 = dst5 + i_dst;
+        pel8_t *dst7 = dst6 + i_dst;
+        pel8_t *dst8 = dst7 + i_dst;
         for (i = 0; i < bsx; src++, i++) {
-            dst1[i] = (pel_t)((src[0] *  9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7);
-            dst2[i] = (pel_t)((src[1] *  9 + src[2] * 25 + src[3] * 23 + src[4] *  7 + 32) >> 6);
-            dst3[i] = (pel_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] *  5 + 64) >> 7);
-            dst4[i] = (pel_t)((src[2] *  3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7);
-            dst5[i] = (pel_t)((src[3] *  3 + src[4] * 11 + src[5] * 13 + src[6] *  5 + 16) >> 5);
-            dst6[i] = (pel_t)((src[4] * 21 + src[5] * 53 + src[6] * 43 + src[7] * 11 + 64) >> 7);
-            dst7[i] = (pel_t)((src[5] * 15 + src[6] * 31 + src[7] * 17 + src[8] + 32)      >> 6);
-            dst8[i] = (pel_t)((src[5] *  3 + src[6] * 19 + src[7] * 29 + src[8] * 13 + 32) >> 6);
+            dst1[i] = (pel8_t)((src[0] *  9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); /* rows 0..3 use the same weights as the bsy == 4 branch */
+            dst2[i] = (pel8_t)((src[1] *  9 + src[2] * 25 + src[3] * 23 + src[4] *  7 + 32) >> 6);
+            dst3[i] = (pel8_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] *  5 + 64) >> 7);
+            dst4[i] = (pel8_t)((src[2] *  3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7);
+            dst5[i] = (pel8_t)((src[3] *  3 + src[4] * 11 + src[5] * 13 + src[6] *  5 + 16) >> 5);
+            dst6[i] = (pel8_t)((src[4] * 21 + src[5] * 53 + src[6] * 43 + src[7] * 11 + 64) >> 7);
+            dst7[i] = (pel8_t)((src[5] * 15 + src[6] * 31 + src[7] * 17 + src[8] + 32)      >> 6);
+            dst8[i] = (pel8_t)((src[5] *  3 + src[6] * 19 + src[7] * 29 + src[8] * 13 + 32) >> 6);
         }
     } else {
-        intra_pred_ang_x_c(src, dst, i_dst, dir_mode, bsx, bsy);
+        intra_pred_ang8_x_c(h, src, dst, i_dst, dir_mode, bsx, bsy); /* other heights: forward every argument unchanged to the generic routine */
+    }
+}
+
+static void intra_pred_ang10_x_7_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) /* pel10_t variant: hand-written 4-tap rows for bsy == 4 and bsy == 8, generic intra_pred_ang10_x_c for every other height */
+{
+    int i;
+    pel10_t *dst1 = dst; /* dst1..dst4 address the first four output rows */
+    pel10_t *dst2 = dst1 + i_dst;
+    pel10_t *dst3 = dst2 + i_dst;
+    pel10_t *dst4 = dst3 + i_dst;
+    if (bsy == 4) {
+        for (i = 0; i < bsx; src++, i++) {
+            dst1[i] = (pel10_t)((src[0] *  9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); /* each row has its own tap weights; the added constant is half the divisor, for rounding */
+            dst2[i] = (pel10_t)((src[1] *  9 + src[2] * 25 + src[3] * 23 + src[4] *  7 + 32) >> 6);
+            dst3[i] = (pel10_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] *  5 + 64) >> 7);
+            dst4[i] = (pel10_t)((src[2] *  3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7);
+        }
+    } else if (bsy == 8) {
+        pel10_t *dst5 = dst4 + i_dst; /* rows 4..7, needed only for the 8-row case */
+        pel10_t *dst6 = dst5 + i_dst;
+        pel10_t *dst7 = dst6 + i_dst;
+        pel10_t *dst8 = dst7 + i_dst;
+        for (i = 0; i < bsx; src++, i++) {
+            dst1[i] = (pel10_t)((src[0] *  9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); /* rows 0..3 use the same weights as the bsy == 4 branch */
+            dst2[i] = (pel10_t)((src[1] *  9 + src[2] * 25 + src[3] * 23 + src[4] *  7 + 32) >> 6);
+            dst3[i] = (pel10_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] *  5 + 64) >> 7);
+            dst4[i] = (pel10_t)((src[2] *  3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7);
+            dst5[i] = (pel10_t)((src[3] *  3 + src[4] * 11 + src[5] * 13 + src[6] *  5 + 16) >> 5);
+            dst6[i] = (pel10_t)((src[4] * 21 + src[5] * 53 + src[6] * 43 + src[7] * 11 + 64) >> 7);
+            dst7[i] = (pel10_t)((src[5] * 15 + src[6] * 31 + src[7] * 17 + src[8] + 32)      >> 6);
+            dst8[i] = (pel10_t)((src[5] *  3 + src[6] * 19 + src[7] * 29 + src[8] * 13 + 32) >> 6);
+        }
+    } else {
+        intra_pred_ang10_x_c(h, src, dst, i_dst, dir_mode, bsx, bsy); /* other heights: forward every argument unchanged to the generic routine */
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_x_8_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_x_8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) /* pel8_t variant: output rows alternate between two filtered lines; h and dir_mode are not referenced in this body */
+{
+    ALIGN16(pel8_t first_line[2 * (64 + 32)]); /* backing storage for both filtered lines */
+    int line_size = bsx + (bsy >> 1) - 1;
+    int aligned_line_size = ((line_size + 15) >> 4) << 4; /* line_size rounded up to a multiple of 16 */
+    int i_dst2 = i_dst << 1; /* stride covering two output rows */
+    int i;
+    pel8_t *pfirst[2];
+
+    pfirst[0] = first_line;
+    pfirst[1] = first_line + aligned_line_size;
+    for (i = 0; i < line_size; i++, src++) {
+        pfirst[0][i] = (pel8_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); /* even rows: 4-tap (1,3,3,1)/8 */
+        pfirst[1][i] = (pel8_t)((src[1] + (src[2] << 1)         + src[3] + 2) >> 2); /* odd rows: 3-tap (1,2,1)/4 */
+    }
+
+    bsy >>= 1; /* from here bsy counts row pairs */
+    for (i = 0; i < bsy; i++) {
+        memcpy(dst        , pfirst[0] + i, bsx * sizeof(pel8_t)); /* pair i reads both lines shifted by i samples */
+        memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel8_t));
+        dst += i_dst2;
+    }
+}
+
+static void intra_pred_ang10_x_8_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) /* pel10_t variant: output rows alternate between two filtered lines; h and dir_mode are not referenced in this body */
 {
-    ALIGN16(pel_t first_line[2 * (64 + 32)]);
+    ALIGN16(pel10_t first_line[2 * (64 + 32)]); /* backing storage for both filtered lines */
     int line_size = bsx + (bsy >> 1) - 1;
     int aligned_line_size = ((line_size + 15) >> 4) << 4;
     int i_dst2 = i_dst << 1;
     int i;
-    pel_t *pfirst[2];
+    pel10_t *pfirst[2]; /* [0] feeds even rows, [1] feeds odd rows */
 
     pfirst[0] = first_line;
     pfirst[1] = first_line + aligned_line_size;
     for (i = 0; i < line_size; i++, src++) {
-        pfirst[0][i] = (pel_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3);
-        pfirst[1][i] = (pel_t)((src[1] + (src[2] << 1)         + src[3] + 2) >> 2);
+        pfirst[0][i] = (pel10_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); /* even rows: 4-tap (1,3,3,1)/8 */
+        pfirst[1][i] = (pel10_t)((src[1] + (src[2] << 1)         + src[3] + 2) >> 2); /* odd rows: 3-tap (1,2,1)/4 */
     }
 
     bsy >>= 1;
     for (i = 0; i < bsy; i++) {
-        memcpy(dst        , pfirst[0] + i, bsx * sizeof(pel_t));
-        memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t));
+        memcpy(dst        , pfirst[0] + i, bsx * sizeof(pel10_t)); /* pair i reads both lines shifted by i samples */
+        memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel10_t));
         dst += i_dst2;
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_x_9_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_x_9_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     if (bsy > 8) {
-        intra_pred_ang_x_c(src, dst, i_dst, dir_mode, bsx, bsy);
+        intra_pred_ang8_x_c(h, src, dst, i_dst, dir_mode, bsx, bsy);
         /*
-        ALIGN16(pel_t first_line[(64 + 32) * 11]);
+        ALIGN16(pel8_t first_line[(64 + 32) * 11]);
         int line_size = bsx + (bsy * 93 >> 8) - 1;
         int real_size = XAVS2_MIN(line_size, bsx * 2);
         int aligned_line_size = ((line_size + 31) >> 5) << 5;
         int i_dst11 = i_dst * 11;
         int i;
-        pel_t pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9, pad10, pad11;
-        pel_t *pfirst[11];
+        pel8_t pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9, pad10, pad11;
+        pel8_t *pfirst[11];
 
         pfirst[0] = first_line;
         pfirst[1] = pfirst[0] + aligned_line_size;
@@ -711,17 +1226,17 @@ static void intra_pred_ang_x_9_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode
         pfirst[9] = pfirst[8] + aligned_line_size;
         pfirst[10] = pfirst[9] + aligned_line_size;
         for (i = 0; i < real_size; i++, src++) {
-            pfirst[0][i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7);
-            pfirst[1][i] = (pel_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7);
-            pfirst[2][i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + 1 * src[4] + 32) >> 6);
-            pfirst[3][i] = (pel_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6);
-            pfirst[4][i] = (pel_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6);
-            pfirst[5][i] = (pel_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7);
-            pfirst[6][i] = (pel_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7);
-            pfirst[7][i] = (pel_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7);
-            pfirst[8][i] = (pel_t)((3 * src[3] + 7 * src[4] + 5 * src[5] + 1 * src[6] + 8) >> 4);
-            pfirst[9][i] = (pel_t)((3 * src[3] + 11 * src[4] + 13 * src[5] + 5 * src[6] + 16) >> 5);
-            pfirst[10][i] = (pel_t)((1 * src[3] + 33 * src[4] + 63 * src[5] + 31 * src[6] + 64) >> 7);
+            pfirst[0][i] = (pel8_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7);
+            pfirst[1][i] = (pel8_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7);
+            pfirst[2][i] = (pel8_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + 1 * src[4] + 32) >> 6);
+            pfirst[3][i] = (pel8_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6);
+            pfirst[4][i] = (pel8_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6);
+            pfirst[5][i] = (pel8_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7);
+            pfirst[6][i] = (pel8_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7);
+            pfirst[7][i] = (pel8_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7);
+            pfirst[8][i] = (pel8_t)((3 * src[3] + 7 * src[4] + 5 * src[5] + 1 * src[6] + 8) >> 4);
+            pfirst[9][i] = (pel8_t)((3 * src[3] + 11 * src[4] + 13 * src[5] + 5 * src[6] + 16) >> 5);
+            pfirst[10][i] = (pel8_t)((1 * src[3] + 33 * src[4] + 63 * src[5] + 31 * src[6] + 64) >> 7);
         }
 
         // padding
@@ -776,75 +1291,266 @@ static void intra_pred_ang_x_9_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode
 
         int bsy_b = bsy / 11;
         for (i = 0; i < bsy_b; i++) {
-            memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t));
-            memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 8 * i_dst, pfirst[8] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 9 * i_dst, pfirst[9] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 10 * i_dst, pfirst[10] + i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[0] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 8 * i_dst, pfirst[8] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 9 * i_dst, pfirst[9] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 10 * i_dst, pfirst[10] + i, bsx * sizeof(pel8_t));
             dst += i_dst11;
         }
         int bsy_r = bsy - bsy_b * 11;
         for (i = 0; i < bsy_r; i++) {
-            memcpy(dst, pfirst[i] + bsy_b, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[i] + bsy_b, bsx * sizeof(pel8_t));
             dst += i_dst;
         }
         */
     } else if (bsy == 8) {
-        pel_t *dst1 = dst;
-        pel_t *dst2 = dst1 + i_dst;
-        pel_t *dst3 = dst2 + i_dst;
-        pel_t *dst4 = dst3 + i_dst;
-        pel_t *dst5 = dst4 + i_dst;
-        pel_t *dst6 = dst5 + i_dst;
-        pel_t *dst7 = dst6 + i_dst;
-        pel_t *dst8 = dst7 + i_dst;
+        pel8_t *dst1 = dst;
+        pel8_t *dst2 = dst1 + i_dst;
+        pel8_t *dst3 = dst2 + i_dst;
+        pel8_t *dst4 = dst3 + i_dst;
+        pel8_t *dst5 = dst4 + i_dst;
+        pel8_t *dst6 = dst5 + i_dst;
+        pel8_t *dst7 = dst6 + i_dst;
+        pel8_t *dst8 = dst7 + i_dst;
         for (int i = 0; i < bsx; i++, src++) {
-            dst1[i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7);
-            dst2[i] = (pel_t)((9  * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7);
-            dst3[i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] +      src[4] + 32) >> 6);
-            dst4[i] = (pel_t)((9  * src[1] + 25 * src[2] + 23 * src[3] + 7  * src[4] + 32) >> 6);
+            dst1[i] = (pel8_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7);
+            dst2[i] = (pel8_t)((9  * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7);
+            dst3[i] = (pel8_t)((15 * src[1] + 31 * src[2] + 17 * src[3] +      src[4] + 32) >> 6);
+            dst4[i] = (pel8_t)((9  * src[1] + 25 * src[2] + 23 * src[3] + 7  * src[4] + 32) >> 6);
 
-            dst5[i] = (pel_t)((3  * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6);
-            dst6[i] = (pel_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5  * src[5] + 64) >> 7);
-            dst7[i] = (pel_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7);
-            dst8[i] = (pel_t)((3  * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7);
+            dst5[i] = (pel8_t)((3  * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6);
+            dst6[i] = (pel8_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5  * src[5] + 64) >> 7);
+            dst7[i] = (pel8_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7);
+            dst8[i] = (pel8_t)((3  * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7);
         }
     } else { /*if (bsy == 4)*/
-        pel_t *dst1 = dst;
-        pel_t *dst2 = dst1 + i_dst;
-        pel_t *dst3 = dst2 + i_dst;
-        pel_t *dst4 = dst3 + i_dst;
+        pel8_t *dst1 = dst;
+        pel8_t *dst2 = dst1 + i_dst;
+        pel8_t *dst3 = dst2 + i_dst;
+        pel8_t *dst4 = dst3 + i_dst;
         for (int i = 0; i < bsx; i++, src++) {
-            dst1[i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7);
-            dst2[i] = (pel_t)((9  * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7);
-            dst3[i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] +      src[4] + 32) >> 6);
-            dst4[i] = (pel_t)((9  * src[1] + 25 * src[2] + 23 * src[3] + 7  * src[4] + 32) >> 6);
+            dst1[i] = (pel8_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7);
+            dst2[i] = (pel8_t)((9  * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7);
+            dst3[i] = (pel8_t)((15 * src[1] + 31 * src[2] + 17 * src[3] +      src[4] + 32) >> 6);
+            dst4[i] = (pel8_t)((9  * src[1] + 25 * src[2] + 23 * src[3] + 7  * src[4] + 32) >> 6);
         }
     }
+}
+
+static void intra_pred_ang10_x_9_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{   /* x_9 angular intra prediction on pel10_t samples: fills a bsx-wide, bsy-tall block at dst (row stride i_dst) from src */
+    if (bsy > 8) {   /* more than 8 rows: hand off to intra_pred_ang10_x_c (defined outside this chunk); h and dir_mode are used only here */
+        intra_pred_ang10_x_c(h, src, dst, i_dst, dir_mode, bsx, bsy);
+        /* Disabled table-based path, kept for reference: it pre-filters 11 lines and memcpy's them into dst row by row.
+        ALIGN16(pel10_t first_line[(64 + 32) * 11]);
+        int line_size = bsx + (bsy * 93 >> 8) - 1;
+        int real_size = XAVS2_MIN(line_size, bsx * 2);
+        int aligned_line_size = ((line_size + 31) >> 5) << 5;
+        int i_dst11 = i_dst * 11;
+        int i;
+        pel10_t pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9, pad10, pad11;
+        pel10_t *pfirst[11];
+
+        pfirst[0] = first_line;
+        pfirst[1] = pfirst[0] + aligned_line_size;
+        pfirst[2] = pfirst[1] + aligned_line_size;
+        pfirst[3] = pfirst[2] + aligned_line_size;
+        pfirst[4] = pfirst[3] + aligned_line_size;
+        pfirst[5] = pfirst[4] + aligned_line_size;
+        pfirst[6] = pfirst[5] + aligned_line_size;
+        pfirst[7] = pfirst[6] + aligned_line_size;
+        pfirst[8] = pfirst[7] + aligned_line_size;
+        pfirst[9] = pfirst[8] + aligned_line_size;
+        pfirst[10] = pfirst[9] + aligned_line_size;
+        for (i = 0; i < real_size; i++, src++) {
+            pfirst[0][i] = (pel10_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7);
+            pfirst[1][i] = (pel10_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7);
+            pfirst[2][i] = (pel10_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + 1 * src[4] + 32) >> 6);
+            pfirst[3][i] = (pel10_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6);
+            pfirst[4][i] = (pel10_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6);
+            pfirst[5][i] = (pel10_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7);
+            pfirst[6][i] = (pel10_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7);
+            pfirst[7][i] = (pel10_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7);
+            pfirst[8][i] = (pel10_t)((3 * src[3] + 7 * src[4] + 5 * src[5] + 1 * src[6] + 8) >> 4);
+            pfirst[9][i] = (pel10_t)((3 * src[3] + 11 * src[4] + 13 * src[5] + 5 * src[6] + 16) >> 5);
+            pfirst[10][i] = (pel10_t)((1 * src[3] + 33 * src[4] + 63 * src[5] + 31 * src[6] + 64) >> 7);
+        }
+
+        // padding
+        if (real_size < line_size) {
+            pfirst[8][real_size - 3] = pfirst[8][real_size - 4];
+            pfirst[9][real_size - 3] = pfirst[9][real_size - 4];
+            pfirst[10][real_size - 3] = pfirst[10][real_size - 4];
+            pfirst[8][real_size - 2] = pfirst[8][real_size - 3];
+            pfirst[9][real_size - 2] = pfirst[9][real_size - 3];
+            pfirst[10][real_size - 2] = pfirst[10][real_size - 3];
+            pfirst[8][real_size - 1] = pfirst[8][real_size - 2];
+            pfirst[9][real_size - 1] = pfirst[9][real_size - 2];
+            pfirst[10][real_size - 1] = pfirst[10][real_size - 2];
+
+            pfirst[5][real_size - 2] = pfirst[5][real_size - 3];
+            pfirst[6][real_size - 2] = pfirst[6][real_size - 3];
+            pfirst[7][real_size - 2] = pfirst[7][real_size - 3];
+            pfirst[5][real_size - 1] = pfirst[5][real_size - 2];
+            pfirst[6][real_size - 1] = pfirst[6][real_size - 2];
+            pfirst[7][real_size - 1] = pfirst[7][real_size - 2];
+
+            pfirst[2][real_size - 1] = pfirst[2][real_size - 2];
+            pfirst[3][real_size - 1] = pfirst[3][real_size - 2];
+            pfirst[4][real_size - 1] = pfirst[4][real_size - 2];
+
+
+            pad1 = pfirst[0][real_size - 1];
+            pad2 = pfirst[1][real_size - 1];
+            pad3 = pfirst[2][real_size - 1];
+            pad4 = pfirst[3][real_size - 1];
+            pad5 = pfirst[4][real_size - 1];
+            pad6 = pfirst[5][real_size - 1];
+            pad7 = pfirst[6][real_size - 1];
+            pad8 = pfirst[7][real_size - 1];
+            pad9 = pfirst[8][real_size - 1];
+            pad10 = pfirst[9][real_size - 1];
+            pad11 = pfirst[10][real_size - 1];
+            for (; i < line_size; i++) {
+                pfirst[0][i] = pad1;
+                pfirst[1][i] = pad2;
+                pfirst[2][i] = pad3;
+                pfirst[3][i] = pad4;
+                pfirst[4][i] = pad5;
+                pfirst[5][i] = pad6;
+                pfirst[6][i] = pad7;
+                pfirst[7][i] = pad8;
+                pfirst[8][i] = pad9;
+                pfirst[9][i] = pad10;
+                pfirst[10][i] = pad11;
+            }
+        }
+
+        int bsy_b = bsy / 11;
+        for (i = 0; i < bsy_b; i++) {
+            memcpy(dst, pfirst[0] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 8 * i_dst, pfirst[8] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 9 * i_dst, pfirst[9] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 10 * i_dst, pfirst[10] + i, bsx * sizeof(pel10_t));
+            dst += i_dst11;
+        }
+        int bsy_r = bsy - bsy_b * 11;
+        for (i = 0; i < bsy_r; i++) {
+            memcpy(dst, pfirst[i] + bsy_b, bsx * sizeof(pel10_t));
+            dst += i_dst;
+        }
+        */
+    } else if (bsy == 8) {   /* eight rows, one destination pointer per row */
+        pel10_t *dst1 = dst;
+        pel10_t *dst2 = dst1 + i_dst;
+        pel10_t *dst3 = dst2 + i_dst;
+        pel10_t *dst4 = dst3 + i_dst;
+        pel10_t *dst5 = dst4 + i_dst;
+        pel10_t *dst6 = dst5 + i_dst;
+        pel10_t *dst7 = dst6 + i_dst;
+        pel10_t *dst8 = dst7 + i_dst;
+        for (int i = 0; i < bsx; i++, src++) {   /* src slides one sample per output column */
+            dst1[i] = (pel10_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7);   /* 4-tap filter: weights sum to 1 << shift, the added constant is half of that (rounding) */
+            dst2[i] = (pel10_t)((9  * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7);
+            dst3[i] = (pel10_t)((15 * src[1] + 31 * src[2] + 17 * src[3] +      src[4] + 32) >> 6);
+            dst4[i] = (pel10_t)((9  * src[1] + 25 * src[2] + 23 * src[3] + 7  * src[4] + 32) >> 6);
 
+            dst5[i] = (pel10_t)((3  * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6);
+            dst6[i] = (pel10_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5  * src[5] + 64) >> 7);
+            dst7[i] = (pel10_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7);
+            dst8[i] = (pel10_t)((3  * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7);
+        }
+    } else { /* remaining case, expected to be bsy == 4 per the original note: only the first four rows, same filters as the 8-row branch */
+        pel10_t *dst1 = dst;
+        pel10_t *dst2 = dst1 + i_dst;
+        pel10_t *dst3 = dst2 + i_dst;
+        pel10_t *dst4 = dst3 + i_dst;
+        for (int i = 0; i < bsx; i++, src++) {
+            dst1[i] = (pel10_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7);
+            dst2[i] = (pel10_t)((9  * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7);
+            dst3[i] = (pel10_t)((15 * src[1] + 31 * src[2] + 17 * src[3] +      src[4] + 32) >> 6);
+            dst4[i] = (pel10_t)((9  * src[1] + 25 * src[2] + 23 * src[3] + 7  * src[4] + 32) >> 6);
+        }
+    }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_x_10_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_x_10_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{   /* x_10 angular intra prediction on pel8_t samples: fills a bsx-wide, bsy-tall block at dst (row stride i_dst); h and dir_mode are not referenced in this body */
+    pel8_t *dst1 = dst;   /* dst1..dst4 address four consecutive rows of the output */
+    pel8_t *dst2 = dst1 + i_dst;
+    pel8_t *dst3 = dst2 + i_dst;
+    pel8_t *dst4 = dst3 + i_dst;
+    int i;
+
+    if (bsy != 4) {   /* taller blocks: filter four lines once, then copy shifted windows of them */
+        ALIGN16(pel8_t first_line[4 * (64 + 16)]);   /* scratch holding the four filtered lines back to back */
+        int line_size = bsx + bsy / 4 - 1;   /* samples per line: bsx plus one extra per further group of 4 rows */
+        int aligned_line_size = ((line_size + 15) >> 4) << 4;   /* line pitch rounded up to a multiple of 16 */
+        pel8_t *pfirst[4];
+
+        pfirst[0] = first_line;
+        pfirst[1] = first_line + aligned_line_size;
+        pfirst[2] = first_line + aligned_line_size * 2;
+        pfirst[3] = first_line + aligned_line_size * 3;
+
+        for (i = 0; i < line_size; i++, src++) {   /* src slides one sample per filtered position */
+            pfirst[0][i] = (pel8_t)((src[0] * 3 +  src[1] * 7 + src[2]  * 5 + src[3]     + 8) >> 4);   /* weights sum to 1 << shift; the added constant is half of that (rounding) */
+            pfirst[1][i] = (pel8_t)((src[0]     + (src[1]     + src[2]) * 3 + src[3]     + 4) >> 3);
+            pfirst[2][i] = (pel8_t)((src[0]     +  src[1] * 5 + src[2]  * 7 + src[3] * 3 + 8) >> 4);
+            pfirst[3][i] = (pel8_t)((src[1]     +  src[2] * 2 + src[3]                   + 2) >> 2);
+        }
+
+        bsy   >>= 2;   /* from here bsy counts groups of four rows */
+        i_dst <<= 2;   /* and i_dst steps four rows at a time */
+        for (i = 0; i < bsy; i++) {   /* group i reads every filtered line starting at offset i */
+            memcpy(dst1, pfirst[0] + i, bsx * sizeof(pel8_t));
+            memcpy(dst2, pfirst[1] + i, bsx * sizeof(pel8_t));
+            memcpy(dst3, pfirst[2] + i, bsx * sizeof(pel8_t));
+            memcpy(dst4, pfirst[3] + i, bsx * sizeof(pel8_t));
+            dst1 += i_dst;
+            dst2 += i_dst;
+            dst3 += i_dst;
+            dst4 += i_dst;
+        }
+    } else {   /* bsy == 4: write the four rows directly, no scratch buffer */
+        for (i = 0; i < bsx; i++, src++) {
+            dst1[i] = (pel8_t)((src[0] * 3 +  src[1] * 7 + src[2]  * 5 + src[3]     + 8) >> 4);
+            dst2[i] = (pel8_t)((src[0]     + (src[1]     + src[2]) * 3 + src[3]     + 4) >> 3);
+            dst3[i] = (pel8_t)((src[0]     +  src[1] * 5 + src[2]  * 7 + src[3] * 3 + 8) >> 4);
+            dst4[i] = (pel8_t)((src[1]     +  src[2] * 2 + src[3]                   + 2) >> 2);
+        }
+    }
+}
+
+static void intra_pred_ang10_x_10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
-    pel_t *dst1 = dst;
-    pel_t *dst2 = dst1 + i_dst;
-    pel_t *dst3 = dst2 + i_dst;
-    pel_t *dst4 = dst3 + i_dst;
+    pel10_t *dst1 = dst;   /* x_10 angular prediction on pel10_t samples; dst1..dst4 address four consecutive output rows (stride i_dst) */
+    pel10_t *dst2 = dst1 + i_dst;
+    pel10_t *dst3 = dst2 + i_dst;
+    pel10_t *dst4 = dst3 + i_dst;
     int i;
 
     if (bsy != 4) {
-        ALIGN16(pel_t first_line[4 * (64 + 16)]);
+        ALIGN16(pel10_t first_line[4 * (64 + 16)]);   /* scratch for the four filtered lines, stored back to back */
         int line_size = bsx + bsy / 4 - 1;
         int aligned_line_size = ((line_size + 15) >> 4) << 4;
-        pel_t *pfirst[4];
+        pel10_t *pfirst[4];   /* start of each filtered line inside first_line */
 
         pfirst[0] = first_line;
         pfirst[1] = first_line + aligned_line_size;
@@ -852,19 +1558,19 @@ static void intra_pred_ang_x_10_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
         pfirst[3] = first_line + aligned_line_size * 3;
 
         for (i = 0; i < line_size; i++, src++) {
-            pfirst[0][i] = (pel_t)((src[0] * 3 +  src[1] * 7 + src[2]  * 5 + src[3]     + 8) >> 4);
-            pfirst[1][i] = (pel_t)((src[0]     + (src[1]     + src[2]) * 3 + src[3]     + 4) >> 3);
-            pfirst[2][i] = (pel_t)((src[0]     +  src[1] * 5 + src[2]  * 7 + src[3] * 3 + 8) >> 4);
-            pfirst[3][i] = (pel_t)((src[1]     +  src[2] * 2 + src[3]                   + 2) >> 2);
+            pfirst[0][i] = (pel10_t)((src[0] * 3 +  src[1] * 7 + src[2]  * 5 + src[3]     + 8) >> 4);   /* weights sum to 1 << shift; the added constant is half of that (rounding) */
+            pfirst[1][i] = (pel10_t)((src[0]     + (src[1]     + src[2]) * 3 + src[3]     + 4) >> 3);
+            pfirst[2][i] = (pel10_t)((src[0]     +  src[1] * 5 + src[2]  * 7 + src[3] * 3 + 8) >> 4);
+            pfirst[3][i] = (pel10_t)((src[1]     +  src[2] * 2 + src[3]                   + 2) >> 2);
         }
 
         bsy   >>= 2;
         i_dst <<= 2;
         for (i = 0; i < bsy; i++) {
-            memcpy(dst1, pfirst[0] + i, bsx * sizeof(pel_t));
-            memcpy(dst2, pfirst[1] + i, bsx * sizeof(pel_t));
-            memcpy(dst3, pfirst[2] + i, bsx * sizeof(pel_t));
-            memcpy(dst4, pfirst[3] + i, bsx * sizeof(pel_t));
+            memcpy(dst1, pfirst[0] + i, bsx * sizeof(pel10_t));   /* group i of four rows reads each filtered line starting at offset i */
+            memcpy(dst2, pfirst[1] + i, bsx * sizeof(pel10_t));
+            memcpy(dst3, pfirst[2] + i, bsx * sizeof(pel10_t));
+            memcpy(dst4, pfirst[3] + i, bsx * sizeof(pel10_t));
             dst1 += i_dst;
             dst2 += i_dst;
             dst3 += i_dst;
@@ -872,25 +1578,25 @@ static void intra_pred_ang_x_10_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
         }
     } else {
         for (i = 0; i < bsx; i++, src++) {
-            dst1[i] = (pel_t)((src[0] * 3 +  src[1] * 7 + src[2]  * 5 + src[3]     + 8) >> 4);
-            dst2[i] = (pel_t)((src[0]     + (src[1]     + src[2]) * 3 + src[3]     + 4) >> 3);
-            dst3[i] = (pel_t)((src[0]     +  src[1] * 5 + src[2]  * 7 + src[3] * 3 + 8) >> 4);
-            dst4[i] = (pel_t)((src[1]     +  src[2] * 2 + src[3]                   + 2) >> 2);
+            dst1[i] = (pel10_t)((src[0] * 3 +  src[1] * 7 + src[2]  * 5 + src[3]     + 8) >> 4);   /* bsy == 4: the four rows are written directly, no scratch buffer */
+            dst2[i] = (pel10_t)((src[0]     + (src[1]     + src[2]) * 3 + src[3]     + 4) >> 3);
+            dst3[i] = (pel10_t)((src[0]     +  src[1] * 5 + src[2]  * 7 + src[3] * 3 + 8) >> 4);
+            dst4[i] = (pel10_t)((src[1]     +  src[2] * 2 + src[3]                   + 2) >> 2);
         }
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_x_11_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_x_11_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
     if (bsy > 8) {
-        ALIGN16(pel_t first_line[(64 + 16) << 3]);
+        ALIGN16(pel8_t first_line[(64 + 16) << 3]);   /* x_11 angular prediction on pel8_t samples; scratch for eight filtered lines stored back to back */
         int line_size = bsx + (bsy >> 3) - 1;
         int aligned_line_size = ((line_size + 15) >> 4) << 4;
         int i_dst8 = i_dst << 3;
-        pel_t *pfirst[8];
+        pel8_t *pfirst[8];   /* start of each filtered line inside first_line */
 
         pfirst[0] = first_line;
         pfirst[1] = pfirst[0] + aligned_line_size;
@@ -901,108 +1607,230 @@ static void intra_pred_ang_x_11_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
         pfirst[6] = pfirst[5] + aligned_line_size;
         pfirst[7] = pfirst[6] + aligned_line_size;
         for (i = 0; i < line_size; i++, src++) {
-            pfirst[0][i] = (pel_t)((7 * src[0] + 15 * src[1] +  9 * src[2] +     src[3] + 16) >> 5);
-            pfirst[1][i] = (pel_t)((3 * src[0] +  7 * src[1] +  5 * src[2] +     src[3] +  8) >> 4);
-            pfirst[2][i] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5);
-            pfirst[3][i] = (pel_t)((    src[0] +  3 * src[1] +  3 * src[2] +     src[3] +  4) >> 3);
+            pfirst[0][i] = (pel8_t)((7 * src[0] + 15 * src[1] +  9 * src[2] +     src[3] + 16) >> 5);   /* weights sum to 1 << shift; the added constant is half of that (rounding) */
+            pfirst[1][i] = (pel8_t)((3 * src[0] +  7 * src[1] +  5 * src[2] +     src[3] +  8) >> 4);
+            pfirst[2][i] = (pel8_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5);
+            pfirst[3][i] = (pel8_t)((    src[0] +  3 * src[1] +  3 * src[2] +     src[3] +  4) >> 3);
 
-            pfirst[4][i] = (pel_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5);
-            pfirst[5][i] = (pel_t)((    src[0] +  5 * src[1] +  7 * src[2] + 3 * src[3] +  8) >> 4);
-            pfirst[6][i] = (pel_t)((    src[0] +  9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5);
-            pfirst[7][i] = (pel_t)((    src[1] +  2 * src[2] +      src[3] + 0 * src[4] +  2) >> 2);
+            pfirst[4][i] = (pel8_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5);
+            pfirst[5][i] = (pel8_t)((    src[0] +  5 * src[1] +  7 * src[2] + 3 * src[3] +  8) >> 4);
+            pfirst[6][i] = (pel8_t)((    src[0] +  9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5);
+            pfirst[7][i] = (pel8_t)((    src[1] +  2 * src[2] +      src[3]              +  2) >> 2);   /* 3-tap row: src[4] contributes nothing, so it is not read */
         }
 
         bsy >>= 3;
         for (i = 0; i < bsy; i++) {
-            memcpy(dst            , pfirst[0] + i, bsx * sizeof(pel_t));
-            memcpy(dst +     i_dst, pfirst[1] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel_t));
-            memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel_t));
+            memcpy(dst            , pfirst[0] + i, bsx * sizeof(pel8_t));   /* group i of eight rows reads each filtered line starting at offset i */
+            memcpy(dst +     i_dst, pfirst[1] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel8_t));
+            memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel8_t));
             dst += i_dst8;
         }
     } else if (bsy == 8) {
-        pel_t *dst1 = dst;
-        pel_t *dst2 = dst1 + i_dst;
-        pel_t *dst3 = dst2 + i_dst;
-        pel_t *dst4 = dst3 + i_dst;
-        pel_t *dst5 = dst4 + i_dst;
-        pel_t *dst6 = dst5 + i_dst;
-        pel_t *dst7 = dst6 + i_dst;
-        pel_t *dst8 = dst7 + i_dst;
+        pel8_t *dst1 = dst;   /* bsy == 8: one destination pointer per output row, written directly */
+        pel8_t *dst2 = dst1 + i_dst;
+        pel8_t *dst3 = dst2 + i_dst;
+        pel8_t *dst4 = dst3 + i_dst;
+        pel8_t *dst5 = dst4 + i_dst;
+        pel8_t *dst6 = dst5 + i_dst;
+        pel8_t *dst7 = dst6 + i_dst;
+        pel8_t *dst8 = dst7 + i_dst;
         for (i = 0; i < bsx; i++, src++) {
-            dst1[i] = (pel_t)((7 * src[0] + 15 * src[1] +  9 * src[2] +     src[3] + 16) >> 5);
-            dst2[i] = (pel_t)((3 * src[0] +  7 * src[1] +  5 * src[2] +     src[3] + 8) >> 4);
-            dst3[i] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5);
-            dst4[i] = (pel_t)((    src[0] +  3 * src[1] +  3 * src[2] +     src[3] + 4) >> 3);
+            dst1[i] = (pel8_t)((7 * src[0] + 15 * src[1] +  9 * src[2] +     src[3] + 16) >> 5);
+            dst2[i] = (pel8_t)((3 * src[0] +  7 * src[1] +  5 * src[2] +     src[3] + 8) >> 4);
+            dst3[i] = (pel8_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5);
+            dst4[i] = (pel8_t)((    src[0] +  3 * src[1] +  3 * src[2] +     src[3] + 4) >> 3);
 
-            dst5[i] = (pel_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5);
-            dst6[i] = (pel_t)((    src[0] +  5 * src[1] +  7 * src[2] + 3 * src[3] +  8) >> 4);
-            dst7[i] = (pel_t)((    src[0] +  9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5);
-            dst8[i] = (pel_t)((    src[1] +  2 * src[2] +      src[3] +            +  2) >> 2);
+            dst5[i] = (pel8_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5);
+            dst6[i] = (pel8_t)((    src[0] +  5 * src[1] +  7 * src[2] + 3 * src[3] +  8) >> 4);
+            dst7[i] = (pel8_t)((    src[0] +  9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5);
+            dst8[i] = (pel8_t)((    src[1] +  2 * src[2] +      src[3]              +  2) >> 2);   /* 3-tap row, same filter as pfirst[7] in the tall-block branch */
         }
     } else {
         for (i = 0; i < bsx; i++, src++) {
-            pel_t *dst1 = dst;
-            pel_t *dst2 = dst1 + i_dst;
-            pel_t *dst3 = dst2 + i_dst;
-            pel_t *dst4 = dst3 + i_dst;
-            dst1[i] = (pel_t)(( 7 * src[0] + 15 * src[1] +  9 * src[2] +      src[3] + 16) >> 5);
-            dst2[i] = (pel_t)(( 3 * src[0] +  7 * src[1] +  5 * src[2] +      src[3] +  8) >> 4);
-            dst3[i] = (pel_t)(( 5 * src[0] + 13 * src[1] + 11 * src[2] +  3 * src[3] + 16) >> 5);
-            dst4[i] = (pel_t)((     src[0] +  3 * src[1] +  3 * src[2] +      src[3] +  4) >> 3);
+            pel8_t *dst1 = dst;   /* remaining case (fewer than 8 rows): only the first four rows are produced */
+            pel8_t *dst2 = dst1 + i_dst;
+            pel8_t *dst3 = dst2 + i_dst;
+            pel8_t *dst4 = dst3 + i_dst;
+            dst1[i] = (pel8_t)(( 7 * src[0] + 15 * src[1] +  9 * src[2] +      src[3] + 16) >> 5);
+            dst2[i] = (pel8_t)(( 3 * src[0] +  7 * src[1] +  5 * src[2] +      src[3] +  8) >> 4);
+            dst3[i] = (pel8_t)(( 5 * src[0] + 13 * src[1] + 11 * src[2] +  3 * src[3] + 16) >> 5);
+            dst4[i] = (pel8_t)((     src[0] +  3 * src[1] +  3 * src[2] +      src[3] +  4) >> 3);
         }
     }
 }
 
+static void intra_pred_ang10_x_11_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{   /* x_11 angular intra prediction on pel10_t samples: fills a bsx-wide, bsy-tall block at dst (row stride i_dst); h and dir_mode are not referenced in this body */
+    int i;
+    if (bsy > 8) {   /* tall blocks: filter eight lines once, then copy shifted windows of them */
+        ALIGN16(pel10_t first_line[(64 + 16) << 3]);   /* scratch holding the eight filtered lines back to back */
+        int line_size = bsx + (bsy >> 3) - 1;   /* samples per line: bsx plus one extra per further group of 8 rows */
+        int aligned_line_size = ((line_size + 15) >> 4) << 4;   /* line pitch rounded up to a multiple of 16 */
+        int i_dst8 = i_dst << 3;   /* stride of eight output rows */
+        pel10_t *pfirst[8];
+
+        pfirst[0] = first_line;
+        pfirst[1] = pfirst[0] + aligned_line_size;
+        pfirst[2] = pfirst[1] + aligned_line_size;
+        pfirst[3] = pfirst[2] + aligned_line_size;
+        pfirst[4] = pfirst[3] + aligned_line_size;
+        pfirst[5] = pfirst[4] + aligned_line_size;
+        pfirst[6] = pfirst[5] + aligned_line_size;
+        pfirst[7] = pfirst[6] + aligned_line_size;
+        for (i = 0; i < line_size; i++, src++) {   /* src slides one sample per filtered position */
+            pfirst[0][i] = (pel10_t)((7 * src[0] + 15 * src[1] +  9 * src[2] +     src[3] + 16) >> 5);   /* weights sum to 1 << shift; the added constant is half of that (rounding) */
+            pfirst[1][i] = (pel10_t)((3 * src[0] +  7 * src[1] +  5 * src[2] +     src[3] +  8) >> 4);
+            pfirst[2][i] = (pel10_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5);
+            pfirst[3][i] = (pel10_t)((    src[0] +  3 * src[1] +  3 * src[2] +     src[3] +  4) >> 3);
+
+            pfirst[4][i] = (pel10_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5);
+            pfirst[5][i] = (pel10_t)((    src[0] +  5 * src[1] +  7 * src[2] + 3 * src[3] +  8) >> 4);
+            pfirst[6][i] = (pel10_t)((    src[0] +  9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5);
+            pfirst[7][i] = (pel10_t)((    src[1] +  2 * src[2] +      src[3]              +  2) >> 2);   /* 3-tap row: src[4] contributes nothing, so it is not read */
+        }
+
+        bsy >>= 3;   /* from here bsy counts groups of eight rows */
+        for (i = 0; i < bsy; i++) {   /* group i reads every filtered line starting at offset i */
+            memcpy(dst            , pfirst[0] + i, bsx * sizeof(pel10_t));
+            memcpy(dst +     i_dst, pfirst[1] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel10_t));
+            memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel10_t));
+            dst += i_dst8;
+        }
+    } else if (bsy == 8) {   /* eight rows written directly, one destination pointer per row */
+        pel10_t *dst1 = dst;
+        pel10_t *dst2 = dst1 + i_dst;
+        pel10_t *dst3 = dst2 + i_dst;
+        pel10_t *dst4 = dst3 + i_dst;
+        pel10_t *dst5 = dst4 + i_dst;
+        pel10_t *dst6 = dst5 + i_dst;
+        pel10_t *dst7 = dst6 + i_dst;
+        pel10_t *dst8 = dst7 + i_dst;
+        for (i = 0; i < bsx; i++, src++) {
+            dst1[i] = (pel10_t)((7 * src[0] + 15 * src[1] +  9 * src[2] +     src[3] + 16) >> 5);
+            dst2[i] = (pel10_t)((3 * src[0] +  7 * src[1] +  5 * src[2] +     src[3] + 8) >> 4);
+            dst3[i] = (pel10_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5);
+            dst4[i] = (pel10_t)((    src[0] +  3 * src[1] +  3 * src[2] +     src[3] + 4) >> 3);
+
+            dst5[i] = (pel10_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5);
+            dst6[i] = (pel10_t)((    src[0] +  5 * src[1] +  7 * src[2] + 3 * src[3] +  8) >> 4);
+            dst7[i] = (pel10_t)((    src[0] +  9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5);
+            dst8[i] = (pel10_t)((    src[1] +  2 * src[2] +      src[3]              +  2) >> 2);   /* 3-tap row, same filter as pfirst[7] in the tall-block branch */
+        }
+    } else {   /* remaining case (fewer than 8 rows): only the first four rows are produced */
+        pel10_t *dst1 = dst;   /* row pointers do not depend on i, so they are set up once before the loop */
+        pel10_t *dst2 = dst1 + i_dst;
+        pel10_t *dst3 = dst2 + i_dst;
+        pel10_t *dst4 = dst3 + i_dst;
+        for (i = 0; i < bsx; i++, src++) {
+            dst1[i] = (pel10_t)(( 7 * src[0] + 15 * src[1] +  9 * src[2] +      src[3] + 16) >> 5);
+            dst2[i] = (pel10_t)(( 3 * src[0] +  7 * src[1] +  5 * src[2] +      src[3] +  8) >> 4);
+            dst3[i] = (pel10_t)(( 5 * src[0] + 13 * src[1] + 11 * src[2] +  3 * src[3] + 16) >> 5);
+            dst4[i] = (pel10_t)((     src[0] +  3 * src[1] +  3 * src[2] +      src[3] +  4) >> 3);
+        }
+    }
+}
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_y_25_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_y_25_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t build of the mode-25 predictor; h and dir_mode are not read in this body */
 {
     int i;
 
     if (bsx > 8) {
-        ALIGN16(pel_t first_line[64 + (64 << 3)]);
+        ALIGN16(pel8_t first_line[64 + (64 << 3)]);  /* shared sample line: 8 entries are produced per src step */
         int line_size = bsx + ((bsy - 1) << 3);
         int iHeight8 = bsy << 3;
         for (i = 0; i < line_size; i += 8, src--) {
-            first_line[0 + i] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] *  9 + src[-3] * 1 + 16) >> 5);
-            first_line[1 + i] = (pel_t)((src[0] * 3 + src[-1] * 7  + src[-2] *  5 + src[-3] * 1 + 8) >> 4);
-            first_line[2 + i] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5);
-            first_line[3 + i] = (pel_t)((src[0] * 1 + src[-1] * 3  + src[-2] *  3 + src[-3] * 1 + 4) >> 3);
+            first_line[0 + i] = (pel8_t)((src[0] * 7 + src[-1] * 15 + src[-2] *  9 + src[-3] * 1 + 16) >> 5);  /* weights sum to 1 << shift; the added constant rounds to nearest */
+            first_line[1 + i] = (pel8_t)((src[0] * 3 + src[-1] * 7  + src[-2] *  5 + src[-3] * 1 + 8) >> 4);
+            first_line[2 + i] = (pel8_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5);
+            first_line[3 + i] = (pel8_t)((src[0] * 1 + src[-1] * 3  + src[-2] *  3 + src[-3] * 1 + 4) >> 3);
 
-            first_line[4 + i] = (pel_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5);
-            first_line[5 + i] = (pel_t)((src[0] * 1 + src[-1] *  5 + src[-2] *  7 + src[-3] * 3 + 8) >> 4);
-            first_line[6 + i] = (pel_t)((src[0] * 1 + src[-1] *  9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5);
-            first_line[7 + i] = (pel_t)((             src[-1] *  1 + src[-2] *  2 + src[-3] * 1 + 2) >> 2);
+            first_line[4 + i] = (pel8_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5);
+            first_line[5 + i] = (pel8_t)((src[0] * 1 + src[-1] *  5 + src[-2] *  7 + src[-3] * 3 + 8) >> 4);
+            first_line[6 + i] = (pel8_t)((src[0] * 1 + src[-1] *  9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5);
+            first_line[7 + i] = (pel8_t)((             src[-1] *  1 + src[-2] *  2 + src[-3] * 1 + 2) >> 2);  /* 3-tap entry: src[0] is not used */
         }
         for (i = 0; i < iHeight8; i += 8) {
-            memcpy(dst, first_line + i, bsx * sizeof(pel_t));
+            memcpy(dst, first_line + i, bsx * sizeof(pel8_t));  /* each output row starts 8 entries further into first_line */
             dst += i_dst;
         }
     } else if (bsx == 8) {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] *  9 + src[-3] * 1 + 16) >> 5);
-            dst[1] = (pel_t)((src[0] * 3 + src[-1] *  7 + src[-2] *  5 + src[-3] * 1 + 8) >> 4);
-            dst[2] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5);
-            dst[3] = (pel_t)((src[0] * 1 + src[-1] *  3 + src[-2] *  3 + src[-3] * 1 + 4) >> 3);
-
-            dst[4] = (pel_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5);
-            dst[5] = (pel_t)((src[0] * 1 + src[-1] *  5 + src[-2] *  7 + src[-3] * 3 + 8) >> 4);
-            dst[6] = (pel_t)((src[0] * 1 + src[-1] *  9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5);
-            dst[7] = (pel_t)((             src[-1] *  1 + src[-2] *  2 + src[-3] * 1 + 2) >> 2);
+            dst[0] = (pel8_t)((src[0] * 7 + src[-1] * 15 + src[-2] *  9 + src[-3] * 1 + 16) >> 5);  /* same 8 filters as the wide path, written straight to the row */
+            dst[1] = (pel8_t)((src[0] * 3 + src[-1] *  7 + src[-2] *  5 + src[-3] * 1 + 8) >> 4);
+            dst[2] = (pel8_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5);
+            dst[3] = (pel8_t)((src[0] * 1 + src[-1] *  3 + src[-2] *  3 + src[-3] * 1 + 4) >> 3);
+
+            dst[4] = (pel8_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5);
+            dst[5] = (pel8_t)((src[0] * 1 + src[-1] *  5 + src[-2] *  7 + src[-3] * 3 + 8) >> 4);
+            dst[6] = (pel8_t)((src[0] * 1 + src[-1] *  9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5);
+            dst[7] = (pel8_t)((             src[-1] *  1 + src[-2] *  2 + src[-3] * 1 + 2) >> 2);
             dst += i_dst;
         }
     } else {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] *  9 + src[-3] * 1 + 16) >> 5);
-            dst[1] = (pel_t)((src[0] * 3 + src[-1] *  7 + src[-2] *  5 + src[-3] * 1 + 8) >> 4);
-            dst[2] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5);
-            dst[3] = (pel_t)((src[0] * 1 + src[-1] *  3 + src[-2] *  3 + src[-3] * 1 + 4) >> 3);
+            dst[0] = (pel8_t)((src[0] * 7 + src[-1] * 15 + src[-2] *  9 + src[-3] * 1 + 16) >> 5);  /* remaining widths: only the first 4 outputs of each row are written */
+            dst[1] = (pel8_t)((src[0] * 3 + src[-1] *  7 + src[-2] *  5 + src[-3] * 1 + 8) >> 4);
+            dst[2] = (pel8_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5);
+            dst[3] = (pel8_t)((src[0] * 1 + src[-1] *  3 + src[-2] *  3 + src[-3] * 1 + 4) >> 3);
+            dst += i_dst;  /* advance to the next output row */
+        }
+    }
+}
+
+static void intra_pred_ang10_y_25_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel10_t build of the mode-25 predictor; h and dir_mode are not read in this body */
+{
+    int i;
+
+    if (bsx > 8) {  /* wide blocks: fill one shared line, then copy a shifted window per row */
+        ALIGN16(pel10_t first_line[64 + (64 << 3)]);
+        int line_size = bsx + ((bsy - 1) << 3);  /* bsx entries for the last row plus 8 per earlier row */
+        int iHeight8 = bsy << 3;
+        for (i = 0; i < line_size; i += 8, src--) {  /* 8 entries per step; src moves back one sample */
+            first_line[0 + i] = (pel10_t)((src[0] * 7 + src[-1] * 15 + src[-2] *  9 + src[-3] * 1 + 16) >> 5);  /* weights sum to 1 << shift; the added constant rounds to nearest */
+            first_line[1 + i] = (pel10_t)((src[0] * 3 + src[-1] * 7  + src[-2] *  5 + src[-3] * 1 + 8) >> 4);
+            first_line[2 + i] = (pel10_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5);
+            first_line[3 + i] = (pel10_t)((src[0] * 1 + src[-1] * 3  + src[-2] *  3 + src[-3] * 1 + 4) >> 3);
+
+            first_line[4 + i] = (pel10_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5);
+            first_line[5 + i] = (pel10_t)((src[0] * 1 + src[-1] *  5 + src[-2] *  7 + src[-3] * 3 + 8) >> 4);
+            first_line[6 + i] = (pel10_t)((src[0] * 1 + src[-1] *  9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5);
+            first_line[7 + i] = (pel10_t)((             src[-1] *  1 + src[-2] *  2 + src[-3] * 1 + 2) >> 2);  /* 3-tap entry: src[0] is not used */
+        }
+        for (i = 0; i < iHeight8; i += 8) {
+            memcpy(dst, first_line + i, bsx * sizeof(pel10_t));  /* each output row starts 8 entries further into first_line */
+            dst += i_dst;
+        }
+    } else if (bsx == 8) {  /* 8 outputs per row computed directly into dst */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel10_t)((src[0] * 7 + src[-1] * 15 + src[-2] *  9 + src[-3] * 1 + 16) >> 5);
+            dst[1] = (pel10_t)((src[0] * 3 + src[-1] *  7 + src[-2] *  5 + src[-3] * 1 + 8) >> 4);
+            dst[2] = (pel10_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5);
+            dst[3] = (pel10_t)((src[0] * 1 + src[-1] *  3 + src[-2] *  3 + src[-3] * 1 + 4) >> 3);
+
+            dst[4] = (pel10_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5);
+            dst[5] = (pel10_t)((src[0] * 1 + src[-1] *  5 + src[-2] *  7 + src[-3] * 3 + 8) >> 4);
+            dst[6] = (pel10_t)((src[0] * 1 + src[-1] *  9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5);
+            dst[7] = (pel10_t)((             src[-1] *  1 + src[-2] *  2 + src[-3] * 1 + 2) >> 2);
+            dst += i_dst;
+        }
+    } else {  /* remaining widths: only the first 4 outputs of each row are written */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel10_t)((src[0] * 7 + src[-1] * 15 + src[-2] *  9 + src[-3] * 1 + 16) >> 5);
+            dst[1] = (pel10_t)((src[0] * 3 + src[-1] *  7 + src[-2] *  5 + src[-3] * 1 + 8) >> 4);
+            dst[2] = (pel10_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5);
+            dst[3] = (pel10_t)((src[0] * 1 + src[-1] *  3 + src[-2] *  3 + src[-3] * 1 + 4) >> 3);
             dst += i_dst;
         }
     }
@@ -1010,32 +1838,63 @@ static void intra_pred_ang_y_25_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_y_26_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_y_26_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t build of the mode-26 predictor; h and dir_mode are not read in this body */
 {
     int i;
 
     if (bsx != 4) {
-        ALIGN16(pel_t first_line[64 + 256]);
+        ALIGN16(pel8_t first_line[64 + 256]);  /* shared sample line: 4 entries are produced per src step */
         int line_size = bsx + ((bsy - 1) << 2);
         int iHeight4 = bsy << 2;
 
         for (i = 0; i < line_size; i += 4, src--) {
-            first_line[i    ] = (pel_t)((src[ 0] * 3 +  src[-1] * 7 + src[-2]  * 5 + src[-3]     + 8) >> 4);
-            first_line[i + 1] = (pel_t)((src[ 0]     + (src[-1]     + src[-2]) * 3 + src[-3]     + 4) >> 3);
-            first_line[i + 2] = (pel_t)((src[ 0]     +  src[-1] * 5 + src[-2]  * 7 + src[-3] * 3 + 8) >> 4);
-            first_line[i + 3] = (pel_t)((src[-1]     +  src[-2] * 2 + src[-3]                    + 2) >> 2);
+            first_line[i    ] = (pel8_t)((src[ 0] * 3 +  src[-1] * 7 + src[-2]  * 5 + src[-3]     + 8) >> 4);  /* weights sum to 1 << shift; the added constant rounds to nearest */
+            first_line[i + 1] = (pel8_t)((src[ 0]     + (src[-1]     + src[-2]) * 3 + src[-3]     + 4) >> 3);
+            first_line[i + 2] = (pel8_t)((src[ 0]     +  src[-1] * 5 + src[-2]  * 7 + src[-3] * 3 + 8) >> 4);
+            first_line[i + 3] = (pel8_t)((src[-1]     +  src[-2] * 2 + src[-3]                    + 2) >> 2);  /* 3-tap entry: src[0] is not used */
         }
 
         for (i = 0; i < iHeight4; i += 4) {
-            memcpy(dst, first_line + i, bsx * sizeof(pel_t));
+            memcpy(dst, first_line + i, bsx * sizeof(pel8_t));  /* each output row starts 4 entries further into first_line */
             dst += i_dst;
         }
     } else {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((src[ 0] * 3 +  src[-1] * 7 + src[-2]  * 5 + src[-3]     + 8) >> 4);
-            dst[1] = (pel_t)((src[ 0]     + (src[-1]     + src[-2]) * 3 + src[-3]     + 4) >> 3);
-            dst[2] = (pel_t)((src[ 0]     +  src[-1] * 5 + src[-2]  * 7 + src[-3] * 3 + 8) >> 4);
-            dst[3] = (pel_t)((src[-1]     +  src[-2] * 2 + src[-3]                    + 2) >> 2);
+            dst[0] = (pel8_t)((src[ 0] * 3 +  src[-1] * 7 + src[-2]  * 5 + src[-3]     + 8) >> 4);  /* width 4: the same 4 filters written straight to the row */
+            dst[1] = (pel8_t)((src[ 0]     + (src[-1]     + src[-2]) * 3 + src[-3]     + 4) >> 3);
+            dst[2] = (pel8_t)((src[ 0]     +  src[-1] * 5 + src[-2]  * 7 + src[-3] * 3 + 8) >> 4);
+            dst[3] = (pel8_t)((src[-1]     +  src[-2] * 2 + src[-3]                    + 2) >> 2);
+            dst += i_dst;  /* advance to the next output row */
+        }
+    }
+}
+
+static void intra_pred_ang10_y_26_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel10_t build of the mode-26 predictor; h and dir_mode are not read in this body */
+{
+    int i;
+
+    if (bsx != 4) {  /* every width except 4: fill one shared line, then copy a shifted window per row */
+        ALIGN16(pel10_t first_line[64 + 256]);
+        int line_size = bsx + ((bsy - 1) << 2);  /* bsx entries for the last row plus 4 per earlier row */
+        int iHeight4 = bsy << 2;
+
+        for (i = 0; i < line_size; i += 4, src--) {  /* 4 entries per step; src moves back one sample */
+            first_line[i    ] = (pel10_t)((src[ 0] * 3 +  src[-1] * 7 + src[-2]  * 5 + src[-3]     + 8) >> 4);  /* weights sum to 1 << shift; the added constant rounds to nearest */
+            first_line[i + 1] = (pel10_t)((src[ 0]     + (src[-1]     + src[-2]) * 3 + src[-3]     + 4) >> 3);
+            first_line[i + 2] = (pel10_t)((src[ 0]     +  src[-1] * 5 + src[-2]  * 7 + src[-3] * 3 + 8) >> 4);
+            first_line[i + 3] = (pel10_t)((src[-1]     +  src[-2] * 2 + src[-3]                    + 2) >> 2);  /* 3-tap entry: src[0] is not used */
+        }
+
+        for (i = 0; i < iHeight4; i += 4) {
+            memcpy(dst, first_line + i, bsx * sizeof(pel10_t));  /* each output row starts 4 entries further into first_line */
+            dst += i_dst;
+        }
+    } else {  /* width 4: the same 4 filters written straight to each row */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel10_t)((src[ 0] * 3 +  src[-1] * 7 + src[-2]  * 5 + src[-3]     + 8) >> 4);
+            dst[1] = (pel10_t)((src[ 0]     + (src[-1]     + src[-2]) * 3 + src[-3]     + 4) >> 3);
+            dst[2] = (pel10_t)((src[ 0]     +  src[-1] * 5 + src[-2]  * 7 + src[-3] * 3 + 8) >> 4);
+            dst[3] = (pel10_t)((src[-1]     +  src[-2] * 2 + src[-3]                    + 2) >> 2);
             dst += i_dst;
         }
     }
@@ -1043,30 +1902,59 @@ static void intra_pred_ang_y_26_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_y_27_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_y_27_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t build of the mode-27 predictor; h and dir_mode are only forwarded on the wide path */
+{
+    int i;
+    if (bsx > 8) {  /* wide blocks are handed to intra_pred_ang8_y_c with the same arguments */
+        intra_pred_ang8_y_c(h, src, dst, i_dst, dir_mode, bsx, bsy);
+    } else if (bsx == 8) {  /* 8 unrolled outputs per row; src moves back one sample per row */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel8_t)((21 * src[0] +  53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7);  /* weights sum to 1 << shift; the added constant rounds to nearest */
+            dst[1] = (pel8_t)(( 9 * src[0] +  41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7);
+            dst[2] = (pel8_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] +  1 * src[-4] + 32) >> 6);
+            dst[3] = (pel8_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] +  7 * src[-4] + 32) >> 6);
+
+            dst[4] = (pel8_t)(( 3 * src[-1] + 19 * src[-2] + 29 * src[-3] + 13 * src[-4] + 32) >> 6);
+            dst[5] = (pel8_t)((27 * src[-2] + 59 * src[-3] + 37 * src[-4] +  5 * src[-5] + 64) >> 7);
+            dst[6] = (pel8_t)((15 * src[-2] + 47 * src[-3] + 49 * src[-4] + 17 * src[-5] + 64) >> 7);
+            dst[7] = (pel8_t)(( 3 * src[-2] + 35 * src[-3] + 61 * src[-4] + 29 * src[-5] + 64) >> 7);
+            dst += i_dst;
+        }
+    } else {  /* remaining widths: only the first 4 outputs of each row are written */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel8_t)((21 * src[0]  + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7);
+            dst[1] = (pel8_t)(( 9 * src[0]  + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7);
+            dst[2] = (pel8_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] +  1 * src[-4] + 32) >> 6);
+            dst[3] = (pel8_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] +  7 * src[-4] + 32) >> 6);
+            dst += i_dst;
+        }
+    }
+}
+
+static void intra_pred_ang10_y_27_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel10_t build of the mode-27 predictor; h and dir_mode are only forwarded on the wide path */
 {
     int i;
     if (bsx > 8) {
-        intra_pred_ang_y_c(src, dst, i_dst, dir_mode, bsx, bsy);
+        intra_pred_ang10_y_c(h, src, dst, i_dst, dir_mode, bsx, bsy);  /* wide blocks are handed on with the same arguments */
     } else if (bsx == 8) {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((21 * src[0] +  53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7);
-            dst[1] = (pel_t)(( 9 * src[0] +  41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7);
-            dst[2] = (pel_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] +  1 * src[-4] + 32) >> 6);
-            dst[3] = (pel_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] +  7 * src[-4] + 32) >> 6);
-
-            dst[4] = (pel_t)(( 3 * src[-1] + 19 * src[-2] + 29 * src[-3] + 13 * src[-4] + 32) >> 6);
-            dst[5] = (pel_t)((27 * src[-2] + 59 * src[-3] + 37 * src[-4] +  5 * src[-5] + 64) >> 7);
-            dst[6] = (pel_t)((15 * src[-2] + 47 * src[-3] + 49 * src[-4] + 17 * src[-5] + 64) >> 7);
-            dst[7] = (pel_t)(( 3 * src[-2] + 35 * src[-3] + 61 * src[-4] + 29 * src[-5] + 64) >> 7);
+            dst[0] = (pel10_t)((21 * src[0] +  53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7);  /* weights sum to 1 << shift; the added constant rounds to nearest */
+            dst[1] = (pel10_t)(( 9 * src[0] +  41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7);
+            dst[2] = (pel10_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] +  1 * src[-4] + 32) >> 6);
+            dst[3] = (pel10_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] +  7 * src[-4] + 32) >> 6);
+
+            dst[4] = (pel10_t)(( 3 * src[-1] + 19 * src[-2] + 29 * src[-3] + 13 * src[-4] + 32) >> 6);
+            dst[5] = (pel10_t)((27 * src[-2] + 59 * src[-3] + 37 * src[-4] +  5 * src[-5] + 64) >> 7);
+            dst[6] = (pel10_t)((15 * src[-2] + 47 * src[-3] + 49 * src[-4] + 17 * src[-5] + 64) >> 7);
+            dst[7] = (pel10_t)(( 3 * src[-2] + 35 * src[-3] + 61 * src[-4] + 29 * src[-5] + 64) >> 7);
             dst += i_dst;
         }
     } else {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((21 * src[0]  + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7);
-            dst[1] = (pel_t)(( 9 * src[0]  + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7);
-            dst[2] = (pel_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] +  1 * src[-4] + 32) >> 6);
-            dst[3] = (pel_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] +  7 * src[-4] + 32) >> 6);
+            dst[0] = (pel10_t)((21 * src[0]  + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7);  /* remaining widths: only the first 4 outputs of each row are written */
+            dst[1] = (pel10_t)(( 9 * src[0]  + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7);
+            dst[2] = (pel10_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] +  1 * src[-4] + 32) >> 6);
+            dst[3] = (pel10_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] +  7 * src[-4] + 32) >> 6);
             dst += i_dst;
         }
     }
@@ -1074,80 +1962,185 @@ static void intra_pred_ang_y_27_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_y_28_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_y_28_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t build of the mode-28 predictor; h and dir_mode are not read in this body */
 {
-    ALIGN16(pel_t first_line[64 + 128]);
+    ALIGN16(pel8_t first_line[64 + 128]);  /* shared sample line: 2 entries are produced per src step */
     int line_size = bsx + ((bsy - 1) << 1);
     int iHeight2 = bsy << 1;
     int i;
 
     for (i = 0; i < line_size; i += 2, src--) {
-        first_line[i    ] = (pel_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3);
-        first_line[i + 1] = (pel_t)((src[-1] + (src[-2] << 1)          + src[-3] + 2) >> 2);
+        first_line[i    ] = (pel8_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3);  /* 4-tap, weights 1 3 3 1, rounded divide by 8 */
+        first_line[i + 1] = (pel8_t)((src[-1] + (src[-2] << 1)          + src[-3] + 2) >> 2);  /* 3-tap, weights 1 2 1, rounded divide by 4 */
     }
 
     for (i = 0; i < iHeight2; i += 2) {
-        memcpy(dst, first_line + i, bsx * sizeof(pel_t));
+        memcpy(dst, first_line + i, bsx * sizeof(pel8_t));  /* each output row starts 2 entries further into first_line */
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_ang10_y_28_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel10_t build of the mode-28 predictor; h and dir_mode are not read in this body */
+{
+    ALIGN16(pel10_t first_line[64 + 128]);  /* shared sample line: 2 entries are produced per src step */
+    int line_size = bsx + ((bsy - 1) << 1);  /* bsx entries for the last row plus 2 per earlier row */
+    int iHeight2 = bsy << 1;
+    int i;
+
+    for (i = 0; i < line_size; i += 2, src--) {
+        first_line[i    ] = (pel10_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3);  /* 4-tap, weights 1 3 3 1, rounded divide by 8 */
+        first_line[i + 1] = (pel10_t)((src[-1] + (src[-2] << 1)          + src[-3] + 2) >> 2);  /* 3-tap, weights 1 2 1, rounded divide by 4 */
+    }
+
+    for (i = 0; i < iHeight2; i += 2) {
+        memcpy(dst, first_line + i, bsx * sizeof(pel10_t));  /* each output row starts 2 entries further into first_line */
         dst += i_dst;
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_y_29_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_y_29_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t build of the mode-29 predictor; h and dir_mode are only forwarded on the wide path */
+{
+    int i;
+    if (bsx > 8) {  /* wide blocks are handed to intra_pred_ang8_y_c with the same arguments */
+        intra_pred_ang8_y_c(h, src, dst, i_dst, dir_mode, bsx, bsy);
+    } else if (bsx == 8) {  /* 8 unrolled outputs per row; src moves back one sample per row */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel8_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7);  /* weights sum to 1 << shift; the added constant rounds to nearest */
+            dst[1] = (pel8_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6);
+            dst[2] = (pel8_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7);
+            dst[3] = (pel8_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7);
+
+            dst[4] = (pel8_t)((src[-3] * 3 + src[-4] * 11 + src[-5] * 13 + src[-6] * 5 + 16) >> 5);
+            dst[5] = (pel8_t)((src[-4] * 21 + src[-5] * 53 + src[-6] * 43 + src[-7] * 11 + 64) >> 7);
+            dst[6] = (pel8_t)((src[-5] * 15 + src[-6] * 31 + src[-7] * 17 + src[-8] + 32) >> 6);
+            dst[7] = (pel8_t)((src[-5] * 3 + src[-6] * 19 + src[-7] * 29 + src[-8] * 13 + 32) >> 6);  /* reads back as far as src[-8] */
+            dst += i_dst;
+        }
+    } else {  /* remaining widths: only the first 4 outputs of each row are written */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel8_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7);
+            dst[1] = (pel8_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6);
+            dst[2] = (pel8_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7);
+            dst[3] = (pel8_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7);
+            dst += i_dst;
+        }
+    }
+}
+
+static void intra_pred_ang10_y_29_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel10_t build of the mode-29 predictor; h and dir_mode are only forwarded on the wide path */
 {
     int i;
     if (bsx > 8) {
-        intra_pred_ang_y_c(src, dst, i_dst, dir_mode, bsx, bsy);
+        intra_pred_ang10_y_c(h, src, dst, i_dst, dir_mode, bsx, bsy);  /* wide blocks are handed on with the same arguments */
+    } else if (bsx == 8) {  /* 8 unrolled outputs per row; src moves back one sample per row */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel10_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7);  /* weights sum to 1 << shift; the added constant rounds to nearest */
+            dst[1] = (pel10_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6);
+            dst[2] = (pel10_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7);
+            dst[3] = (pel10_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7);
+
+            dst[4] = (pel10_t)((src[-3] * 3 + src[-4] * 11 + src[-5] * 13 + src[-6] * 5 + 16) >> 5);
+            dst[5] = (pel10_t)((src[-4] * 21 + src[-5] * 53 + src[-6] * 43 + src[-7] * 11 + 64) >> 7);
+            dst[6] = (pel10_t)((src[-5] * 15 + src[-6] * 31 + src[-7] * 17 + src[-8] + 32) >> 6);
+            dst[7] = (pel10_t)((src[-5] * 3 + src[-6] * 19 + src[-7] * 29 + src[-8] * 13 + 32) >> 6);  /* reads back as far as src[-8] */
+            dst += i_dst;
+        }
+    } else {  /* remaining widths: only the first 4 outputs of each row are written */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel10_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7);
+            dst[1] = (pel10_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6);
+            dst[2] = (pel10_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7);
+            dst[3] = (pel10_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7);
+            dst += i_dst;
+        }
+    }
+}
+
+/* ---------------------------------------------------------------------------
+ */
+static void intra_pred_ang8_y_30_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t build of the mode-30 predictor; h and dir_mode are not read in this body */
+{
+    ALIGN16(pel8_t first_line[64 + 64]);  /* shared sample line: 1 entry is produced per src step */
+    int line_size = bsx + bsy - 1;  /* bsx entries for the last row plus 1 per earlier row */
+    int i;
+
+    src -= 2;  /* with the src[-1..1] taps below, the first entry reads the caller's src[-3..-1] */
+    for (i = 0; i < line_size; i++, src--) {
+        first_line[i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);  /* 3-tap, weights 1 2 1, rounded divide by 4 */
+    }
+
+    for (i = 0; i < bsy; i++) {
+        memcpy(dst, first_line + i, bsx * sizeof(pel8_t));  /* each output row starts 1 entry further into first_line */
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_ang10_y_30_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel10_t build of the mode-30 predictor; h and dir_mode are not read in this body */
+{
+    ALIGN16(pel10_t first_line[64 + 64]);  /* shared sample line: 1 entry is produced per src step */
+    int line_size = bsx + bsy - 1;  /* bsx entries for the last row plus 1 per earlier row */
+    int i;
+
+    src -= 2;  /* with the src[-1..1] taps below, the first entry reads the caller's src[-3..-1] */
+    for (i = 0; i < line_size; i++, src--) {
+        first_line[i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);  /* 3-tap, weights 1 2 1, rounded divide by 4 */
+    }
+
+    for (i = 0; i < bsy; i++) {
+        memcpy(dst, first_line + i, bsx * sizeof(pel10_t));  /* each output row starts 1 entry further into first_line */
+        dst += i_dst;
+    }
+}
+
+/* ---------------------------------------------------------------------------
+ */
+static void intra_pred_ang8_y_31_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t build of the mode-31 predictor; dir_mode is not read, h is only forwarded */
+{
+    ALIGN16(pel8_t dst_tran[MAX_CU_SIZE * MAX_CU_SIZE]);  /* scratch output of the x-direction call, stored with stride bsy */
+    ALIGN16(pel8_t src_tran[MAX_CU_SIZE << 3]);  /* reversed copy of the reference samples */
+    int i;
+    if (bsx >= bsy) {  /* reuse the x-direction mode-5 routine on reversed input, then transpose the result */
+        // transposition
+        // i < (bsx * 19 / 8 + 3)
+        for (i = 0; i < (bsy + bsx * 11 / 8 + 3); i++) {
+            src_tran[i] = src[-i];  /* src_tran[i] takes the sample i positions before src */
+        }
+        intra_pred_ang8_x_5_c(h, src_tran, dst_tran, bsy, 5, bsy, bsx);  /* stride bsy, mode 5, bsx/bsy swapped (assuming the same parameter order as this function) */
+        for (i = 0; i < bsy; i++) {
+            for (int j = 0; j < bsx; j++) {
+                dst[j + i_dst * i] = dst_tran[i + bsy * j];  /* transpose back into dst */
+            }
+        }
     } else if (bsx == 8) {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7);
-            dst[1] = (pel_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6);
-            dst[2] = (pel_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7);
-            dst[3] = (pel_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7);
-
-            dst[4] = (pel_t)((src[-3] * 3 + src[-4] * 11 + src[-5] * 13 + src[-6] * 5 + 16) >> 5);
-            dst[5] = (pel_t)((src[-4] * 21 + src[-5] * 53 + src[-6] * 43 + src[-7] * 11 + 64) >> 7);
-            dst[6] = (pel_t)((src[-5] * 15 + src[-6] * 31 + src[-7] * 17 + src[-8] + 32) >> 6);
-            dst[7] = (pel_t)((src[-5] * 3 + src[-6] * 19 + src[-7] * 29 + src[-8] * 13 + 32) >> 6);
+            dst[0] = (pel8_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5);  /* weights sum to 1 << shift; the added constant rounds to nearest */
+            dst[1] = (pel8_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4);
+            dst[2] = (pel8_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5);
+            dst[3] = (pel8_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3);
+
+            dst[4] = (pel8_t)((1 * src[-6] + 9 * src[-7] + 15 * src[-8] + 7 * src[-9] + 16) >> 5);
+            dst[5] = (pel8_t)((3 * src[-8] + 7 * src[-9] + 5 * src[-10] + 1 * src[-11] + 8) >> 4);
+            dst[6] = (pel8_t)((3 * src[-9] + 11 * src[-10] + 13 * src[-11] + 5 * src[-12] + 16) >> 5);
+            dst[7] = (pel8_t)((1 * src[-11] + 2 * src[-12] + 1 * src[-13] + 0 * src[-14] + 2) >> 2);  /* src[-14] is read but has weight 0 */
             dst += i_dst;
         }
     } else {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7);
-            dst[1] = (pel_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6);
-            dst[2] = (pel_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7);
-            dst[3] = (pel_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7);
+            dst[0] = (pel8_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5);  /* remaining case: only the first 4 outputs of each row are written */
+            dst[1] = (pel8_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4);
+            dst[2] = (pel8_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5);
+            dst[3] = (pel8_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3);
             dst += i_dst;
         }
     }
 }
 
-/* ---------------------------------------------------------------------------
- */
-static void intra_pred_ang_y_30_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
-{
-    ALIGN16(pel_t first_line[64 + 64]);
-    int line_size = bsx + bsy - 1;
-    int i;
-
-    src -= 2;
-    for (i = 0; i < line_size; i++, src--) {
-        first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
-    }
-
-    for (i = 0; i < bsy; i++) {
-        memcpy(dst, first_line + i, bsx * sizeof(pel_t));
-        dst += i_dst;
-    }
-}
-
-/* ---------------------------------------------------------------------------
- */
-static void intra_pred_ang_y_31_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang10_y_31_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel10_t build of the mode-31 predictor; h is only forwarded to the x-direction call */
 {
-    ALIGN16(pel_t dst_tran[MAX_CU_SIZE * MAX_CU_SIZE]);
-    ALIGN16(pel_t src_tran[MAX_CU_SIZE << 3]);
+    ALIGN16(pel10_t dst_tran[MAX_CU_SIZE * MAX_CU_SIZE]);  /* scratch output of the x-direction call, stored with stride bsy */
+    ALIGN16(pel10_t src_tran[MAX_CU_SIZE << 3]);  /* reversed copy of the reference samples */
     int i;
     if (bsx >= bsy) {
         // transposition
@@ -1155,7 +2148,7 @@ static void intra_pred_ang_y_31_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
         for (i = 0; i < (bsy + bsx * 11 / 8 + 3); i++) {
             src_tran[i] = src[-i];
         }
-        intra_pred_ang_x_5_c(src_tran, dst_tran, bsy, 5, bsy, bsx);
+        intra_pred_ang10_x_5_c(h, src_tran, dst_tran, bsy, 5, bsy, bsx);  /* stride bsy, mode 5, bsx/bsy swapped (assuming the same parameter order as this function) */
         for (i = 0; i < bsy; i++) {
             for (int j = 0; j < bsx; j++) {
                 dst[j + i_dst * i] = dst_tran[i + bsy * j];
@@ -1163,23 +2156,23 @@ static void intra_pred_ang_y_31_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
         }
     } else if (bsx == 8) {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5);
-            dst[1] = (pel_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4);
-            dst[2] = (pel_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5);
-            dst[3] = (pel_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3);
-
-            dst[4] = (pel_t)((1 * src[-6] + 9 * src[-7] + 15 * src[-8] + 7 * src[-9] + 16) >> 5);
-            dst[5] = (pel_t)((3 * src[-8] + 7 * src[-9] + 5 * src[-10] + 1 * src[-11] + 8) >> 4);
-            dst[6] = (pel_t)((3 * src[-9] + 11 * src[-10] + 13 * src[-11] + 5 * src[-12] + 16) >> 5);
-            dst[7] = (pel_t)((1 * src[-11] + 2 * src[-12] + 1 * src[-13] + 0 * src[-14] + 2) >> 2);
+            dst[0] = (pel10_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5);  /* weights sum to 1 << shift; the added constant rounds to nearest */
+            dst[1] = (pel10_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4);
+            dst[2] = (pel10_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5);
+            dst[3] = (pel10_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3);
+
+            dst[4] = (pel10_t)((1 * src[-6] + 9 * src[-7] + 15 * src[-8] + 7 * src[-9] + 16) >> 5);
+            dst[5] = (pel10_t)((3 * src[-8] + 7 * src[-9] + 5 * src[-10] + 1 * src[-11] + 8) >> 4);
+            dst[6] = (pel10_t)((3 * src[-9] + 11 * src[-10] + 13 * src[-11] + 5 * src[-12] + 16) >> 5);
+            dst[7] = (pel10_t)((1 * src[-11] + 2 * src[-12] + 1 * src[-13] + 0 * src[-14] + 2) >> 2);  /* src[-14] is read but has weight 0 */
             dst += i_dst;
         }
     } else {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5);
-            dst[1] = (pel_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4);
-            dst[2] = (pel_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5);
-            dst[3] = (pel_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3);
+            dst[0] = (pel10_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5);  /* remaining case: only the first 4 outputs of each row are written */
+            dst[1] = (pel10_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4);
+            dst[2] = (pel10_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5);
+            dst[3] = (pel10_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3);
             dst += i_dst;
         }
     }
@@ -1187,43 +2180,175 @@ static void intra_pred_ang_y_31_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_y_32_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_y_32_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t variant; h and dir_mode are not read in the body */
+{
+    ALIGN16(pel8_t first_line[2 * (32 + 64)]);  /* scratch holding two filtered rows, aligned_line_size apart */
+    int line_size = (bsy >> 1) + bsx - 1;  /* filtered samples produced per scratch row */
+    int aligned_line_size = ((line_size + 15) >> 4) << 4;  /* line_size rounded up to a multiple of 16 */
+    int i_dst2 = i_dst << 1;  /* dst advances two rows per copy iteration */
+    int i;
+    pel8_t *pfirst[2];
+
+    pfirst[0] = first_line;
+    pfirst[1] = first_line + aligned_line_size;
+
+    src -= 3;
+    for (i = 0; i < line_size; i++, src -= 2) {  /* src walks backwards, two samples per output sample */
+        pfirst[0][i] = (pel8_t)((src[1] + (src[ 0] << 1) + src[-1] + 2) >> 2);  /* rounded [1 2 1]/4 filter centred on src[0] */
+        pfirst[1][i] = (pel8_t)((src[0] + (src[-1] << 1) + src[-2] + 2) >> 2);  /* same filter centred on src[-1] */
+    }
+
+    bsy >>= 1;  /* rows are emitted in pairs */
+    for (i = 0; i < bsy; i++) {
+        memcpy(dst        , pfirst[0] + i, bsx * sizeof(pel8_t));  /* first row of the pair: scratch row 0 from offset i */
+        memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel8_t));  /* second row of the pair: scratch row 1 from offset i */
+        dst += i_dst2;
+    }
+}
+
+static void intra_pred_ang10_y_32_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel10_t variant; h and dir_mode are not read in the body */
 {
-    ALIGN16(pel_t first_line[2 * (32 + 64)]);
+    ALIGN16(pel10_t first_line[2 * (32 + 64)]);  /* scratch holding two filtered rows, aligned_line_size apart */
     int line_size = (bsy >> 1) + bsx - 1;
     int aligned_line_size = ((line_size + 15) >> 4) << 4;
     int i_dst2 = i_dst << 1;
     int i;
-    pel_t *pfirst[2];
+    pel10_t *pfirst[2];
 
     pfirst[0] = first_line;
     pfirst[1] = first_line + aligned_line_size;
 
     src -= 3;
     for (i = 0; i < line_size; i++, src -= 2) {
-        pfirst[0][i] = (pel_t)((src[1] + (src[ 0] << 1) + src[-1] + 2) >> 2);
-        pfirst[1][i] = (pel_t)((src[0] + (src[-1] << 1) + src[-2] + 2) >> 2);
+        pfirst[0][i] = (pel10_t)((src[1] + (src[ 0] << 1) + src[-1] + 2) >> 2);  /* rounded [1 2 1]/4 filter centred on src[0] */
+        pfirst[1][i] = (pel10_t)((src[0] + (src[-1] << 1) + src[-2] + 2) >> 2);  /* same filter centred on src[-1] */
     }
 
     bsy >>= 1;
     for (i = 0; i < bsy; i++) {
-        memcpy(dst        , pfirst[0] + i, bsx * sizeof(pel_t));
-        memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t));
+        memcpy(dst        , pfirst[0] + i, bsx * sizeof(pel10_t));  /* first row of the pair: scratch row 0 from offset i */
+        memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel10_t));  /* second row of the pair: scratch row 1 from offset i */
         dst += i_dst2;
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_xy_13_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_xy_13_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t variant; h and dir_mode are not read in the body */
+{
+    int i;
+    if (bsy > 8) {  /* tall blocks: build eight filtered scratch rows, then copy shifted windows */
+        ALIGN16(pel8_t first_line[(64 + 16) << 3]);  /* eight scratch rows, aligned_line_size apart */
+        int line_size = bsx + (bsy >> 3) - 1;  /* samples produced per scratch row */
+        int left_size = line_size - bsx;  /* leading samples filled by the 8-step loop below */
+        int aligned_line_size = ((line_size + 15) >> 4) << 4;  /* line_size rounded up to a multiple of 16 */
+        pel8_t *pfirst[8];
+
+        pfirst[0] = first_line;
+        pfirst[1] = pfirst[0] + aligned_line_size;
+        pfirst[2] = pfirst[1] + aligned_line_size;
+        pfirst[3] = pfirst[2] + aligned_line_size;
+        pfirst[4] = pfirst[3] + aligned_line_size;
+        pfirst[5] = pfirst[4] + aligned_line_size;
+        pfirst[6] = pfirst[5] + aligned_line_size;
+        pfirst[7] = pfirst[6] + aligned_line_size;
+
+        src -= bsy - 8;
+        for (i = 0; i < left_size; i++, src += 8) {  /* src advances eight samples per output sample */
+            pfirst[0][i] = (pel8_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2);  /* rounded [1 2 1]/4 filter centred on src[7] */
+            pfirst[1][i] = (pel8_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2);  /* each following row centres one sample earlier */
+            pfirst[2][i] = (pel8_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2);
+            pfirst[3][i] = (pel8_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2);
+
+            pfirst[4][i] = (pel8_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2);
+            pfirst[5][i] = (pel8_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2);
+            pfirst[6][i] = (pel8_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2);
+            pfirst[7][i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);  /* last row is centred on src[0] */
+        }
+
+        for (; i < line_size; i++, src++) {  /* remaining samples: src advances one sample per output sample */
+            pfirst[0][i] = (pel8_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5);  /* 4-tap weights sum to 32, rounded */
+            pfirst[1][i] = (pel8_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4);  /* weights sum to 16 */
+            pfirst[2][i] = (pel8_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5);
+            pfirst[3][i] = (pel8_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3);  /* weights sum to 8 */
+
+            pfirst[4][i] = (pel8_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5);
+            pfirst[5][i] = (pel8_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4);
+            pfirst[6][i] = (pel8_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5);
+            pfirst[7][i] = (pel8_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2);  /* 3-tap [1 2 1]/4 centred on src[0] */
+        }
+
+        pfirst[0] += left_size;  /* skip the leading samples so each pointer marks the first copy window */
+        pfirst[1] += left_size;
+        pfirst[2] += left_size;
+        pfirst[3] += left_size;
+        pfirst[4] += left_size;
+        pfirst[5] += left_size;
+        pfirst[6] += left_size;
+        pfirst[7] += left_size;
+
+        bsy >>= 3;  /* rows are emitted in groups of eight */
+        for (i = 0; i < bsy; i++) {  /* each group starts one sample earlier in every scratch row */
+            memcpy(dst, pfirst[0] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            memcpy(dst, pfirst[1] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            memcpy(dst, pfirst[2] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            memcpy(dst, pfirst[3] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            memcpy(dst, pfirst[4] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            memcpy(dst, pfirst[5] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            memcpy(dst, pfirst[6] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            memcpy(dst, pfirst[7] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+        }
+    } else if (bsy == 8) {  /* exactly eight rows: write them directly, no scratch buffer */
+        pel8_t *dst1 = dst;
+        pel8_t *dst2 = dst1 + i_dst;
+        pel8_t *dst3 = dst2 + i_dst;
+        pel8_t *dst4 = dst3 + i_dst;
+        pel8_t *dst5 = dst4 + i_dst;
+        pel8_t *dst6 = dst5 + i_dst;
+        pel8_t *dst7 = dst6 + i_dst;
+        pel8_t *dst8 = dst7 + i_dst;
+        for (i = 0; i < bsx; i++, src++) {  /* one column per iteration, same eight filters as the scratch path */
+            dst1[i] = (pel8_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5);
+            dst2[i] = (pel8_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4);
+            dst3[i] = (pel8_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5);
+            dst4[i] = (pel8_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3);
+
+            dst5[i] = (pel8_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5);
+            dst6[i] = (pel8_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4);
+            dst7[i] = (pel8_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5);
+            dst8[i] = (pel8_t)((src[1] + 2 * src[0] + src[-1]  + 2) >> 2);
+        }
+    } else {  /* bsy < 8: only four rows are written */
+        for (i = 0; i < bsx; i++, src++) {
+            pel8_t *dst1 = dst;  /* row pointers do not depend on i; they are recomputed every iteration */
+            pel8_t *dst2 = dst1 + i_dst;
+            pel8_t *dst3 = dst2 + i_dst;
+            pel8_t *dst4 = dst3 + i_dst;
+            dst1[i] = (pel8_t)((7 * src[2] + 15 * src[1] +  9 * src[0] +     src[-1] + 16) >> 5);  /* same filters as the first four rows above */
+            dst2[i] = (pel8_t)((3 * src[2] +  7 * src[1] +  5 * src[0] +     src[-1] + 8) >> 4);
+            dst3[i] = (pel8_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5);
+            dst4[i] = (pel8_t)((    src[2] +  3 * src[1] +  3 * src[0] +     src[-1] + 4) >> 3);
+        }
+    }
+}
+
+static void intra_pred_ang10_xy_13_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
     if (bsy > 8) {
-        ALIGN16(pel_t first_line[(64 + 16) << 3]);
+        ALIGN16(pel10_t first_line[(64 + 16) << 3]);
         int line_size = bsx + (bsy >> 3) - 1;
         int left_size = line_size - bsx;
         int aligned_line_size = ((line_size + 15) >> 4) << 4;
-        pel_t *pfirst[8];
+        pel10_t *pfirst[8];
 
         pfirst[0] = first_line;
         pfirst[1] = pfirst[0] + aligned_line_size;
@@ -1236,27 +2361,27 @@ static void intra_pred_ang_xy_13_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo
 
         src -= bsy - 8;
         for (i = 0; i < left_size; i++, src += 8) {
-            pfirst[0][i] = (pel_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2);
-            pfirst[1][i] = (pel_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2);
-            pfirst[2][i] = (pel_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2);
-            pfirst[3][i] = (pel_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2);
+            pfirst[0][i] = (pel10_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2);
+            pfirst[1][i] = (pel10_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2);
+            pfirst[2][i] = (pel10_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2);
+            pfirst[3][i] = (pel10_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2);
 
-            pfirst[4][i] = (pel_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2);
-            pfirst[5][i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2);
-            pfirst[6][i] = (pel_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2);
-            pfirst[7][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
+            pfirst[4][i] = (pel10_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2);
+            pfirst[5][i] = (pel10_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2);
+            pfirst[6][i] = (pel10_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2);
+            pfirst[7][i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
         }
 
         for (; i < line_size; i++, src++) {
-            pfirst[0][i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5);
-            pfirst[1][i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4);
-            pfirst[2][i] = (pel_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5);
-            pfirst[3][i] = (pel_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3);
+            pfirst[0][i] = (pel10_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5);
+            pfirst[1][i] = (pel10_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4);
+            pfirst[2][i] = (pel10_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5);
+            pfirst[3][i] = (pel10_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3);
 
-            pfirst[4][i] = (pel_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5);
-            pfirst[5][i] = (pel_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4);
-            pfirst[6][i] = (pel_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5);
-            pfirst[7][i] = (pel_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2);
+            pfirst[4][i] = (pel10_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5);
+            pfirst[5][i] = (pel10_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4);
+            pfirst[6][i] = (pel10_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5);
+            pfirst[7][i] = (pel10_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2);
         }
 
         pfirst[0] += left_size;
@@ -1270,66 +2395,131 @@ static void intra_pred_ang_xy_13_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo
 
         bsy >>= 3;
         for (i = 0; i < bsy; i++) {
-            memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[0] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
-            memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[1] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
-            memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[2] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
-            memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[3] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
-            memcpy(dst, pfirst[4] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[4] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
-            memcpy(dst, pfirst[5] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[5] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
-            memcpy(dst, pfirst[6] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[6] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
-            memcpy(dst, pfirst[7] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[7] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
         }
     } else if (bsy == 8) {
-        pel_t *dst1 = dst;
-        pel_t *dst2 = dst1 + i_dst;
-        pel_t *dst3 = dst2 + i_dst;
-        pel_t *dst4 = dst3 + i_dst;
-        pel_t *dst5 = dst4 + i_dst;
-        pel_t *dst6 = dst5 + i_dst;
-        pel_t *dst7 = dst6 + i_dst;
-        pel_t *dst8 = dst7 + i_dst;
+        pel10_t *dst1 = dst;
+        pel10_t *dst2 = dst1 + i_dst;
+        pel10_t *dst3 = dst2 + i_dst;
+        pel10_t *dst4 = dst3 + i_dst;
+        pel10_t *dst5 = dst4 + i_dst;
+        pel10_t *dst6 = dst5 + i_dst;
+        pel10_t *dst7 = dst6 + i_dst;
+        pel10_t *dst8 = dst7 + i_dst;
+        for (i = 0; i < bsx; i++, src++) {
+            dst1[i] = (pel10_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5);
+            dst2[i] = (pel10_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4);
+            dst3[i] = (pel10_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5);
+            dst4[i] = (pel10_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3);
+
+            dst5[i] = (pel10_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5);
+            dst6[i] = (pel10_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4);
+            dst7[i] = (pel10_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5);
+            dst8[i] = (pel10_t)((src[1] + 2 * src[0] + src[-1]  + 2) >> 2);
+        }
+    } else {
         for (i = 0; i < bsx; i++, src++) {
-            dst1[i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5);
-            dst2[i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4);
-            dst3[i] = (pel_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5);
-            dst4[i] = (pel_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3);
+            pel10_t *dst1 = dst;
+            pel10_t *dst2 = dst1 + i_dst;
+            pel10_t *dst3 = dst2 + i_dst;
+            pel10_t *dst4 = dst3 + i_dst;
+            dst1[i] = (pel10_t)((7 * src[2] + 15 * src[1] +  9 * src[0] +     src[-1] + 16) >> 5);
+            dst2[i] = (pel10_t)((3 * src[2] +  7 * src[1] +  5 * src[0] +     src[-1] + 8) >> 4);
+            dst3[i] = (pel10_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5);
+            dst4[i] = (pel10_t)((    src[2] +  3 * src[1] +  3 * src[0] +     src[-1] + 4) >> 3);
+        }
+    }
+}
+
+/* ---------------------------------------------------------------------------
+ */
+static void intra_pred_ang8_xy_14_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t variant; h and dir_mode are not read in the body */
+{
+    int i;
+
+    if (bsy != 4) {  /* build four filtered scratch rows, then copy shifted windows */
+        ALIGN16(pel8_t first_line[4 * (64 + 16)]);  /* four scratch rows, aligned_line_size apart */
+        int line_size = bsx + (bsy >> 2) - 1;  /* samples produced per scratch row */
+        int left_size = line_size - bsx;  /* leading samples filled by the 4-step loop below */
+        int aligned_line_size = ((line_size + 15) >> 4) << 4;  /* line_size rounded up to a multiple of 16 */
+        pel8_t *pfirst[4];
+
+        pfirst[0] = first_line;
+        pfirst[1] = first_line + aligned_line_size;
+        pfirst[2] = first_line + aligned_line_size * 2;
+        pfirst[3] = first_line + aligned_line_size * 3;
+
+        src -= bsy - 4;
+        for (i = 0; i < left_size; i++, src += 4) {  /* src advances four samples per output sample */
+            pfirst[0][i] = (pel8_t)((src[ 2] + (src[3] << 1) + src[4] + 2) >> 2);  /* rounded [1 2 1]/4 filter centred on src[3] */
+            pfirst[1][i] = (pel8_t)((src[ 1] + (src[2] << 1) + src[3] + 2) >> 2);  /* each following row centres one sample earlier */
+            pfirst[2][i] = (pel8_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2);
+            pfirst[3][i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
+        }
+
+        for (; i < line_size; i++, src++) {  /* remaining samples: src advances one sample per output sample */
+            pfirst[0][i] = (pel8_t)((src[-1]     +  src[0] * 5 + src[1]  * 7 + src[2] * 3 + 8) >> 4);  /* 4-tap weights sum to 16, rounded */
+            pfirst[1][i] = (pel8_t)((src[-1]     + (src[0]     + src[1]) * 3 + src[2]     + 4) >> 3);  /* weights 1,3,3,1 sum to 8 */
+            pfirst[2][i] = (pel8_t)((src[-1] * 3 +  src[0] * 7 + src[1]  * 5 + src[2]     + 8) >> 4);
+            pfirst[3][i] = (pel8_t)((src[-1]     +  src[0] * 2 + src[1]                   + 2) >> 2);  /* 3-tap [1 2 1]/4 centred on src[0] */
+        }
+
+        pfirst[0] += left_size;  /* skip the leading samples so each pointer marks the first copy window */
+        pfirst[1] += left_size;
+        pfirst[2] += left_size;
+        pfirst[3] += left_size;
 
-            dst5[i] = (pel_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5);
-            dst6[i] = (pel_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4);
-            dst7[i] = (pel_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5);
-            dst8[i] = (pel_t)((src[1] + 2 * src[0] + src[-1]  + 2) >> 2);
+        bsy >>= 2;  /* rows are emitted in groups of four */
+        for (i = 0; i < bsy; i++) {  /* each group starts one sample earlier in every scratch row */
+            memcpy(dst, pfirst[0] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            memcpy(dst, pfirst[1] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            memcpy(dst, pfirst[2] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            memcpy(dst, pfirst[3] - i, bsx * sizeof(pel8_t));
+            dst += i_dst;
         }
     } else {
+        pel8_t *dst1 = dst;  /* bsy == 4: write the four rows directly, no scratch buffer */
+        pel8_t *dst2 = dst1 + i_dst;
+        pel8_t *dst3 = dst2 + i_dst;
+        pel8_t *dst4 = dst3 + i_dst;
+
         for (i = 0; i < bsx; i++, src++) {
-            pel_t *dst1 = dst;
-            pel_t *dst2 = dst1 + i_dst;
-            pel_t *dst3 = dst2 + i_dst;
-            pel_t *dst4 = dst3 + i_dst;
-            dst1[i] = (pel_t)((7 * src[2] + 15 * src[1] +  9 * src[0] +     src[-1] + 16) >> 5);
-            dst2[i] = (pel_t)((3 * src[2] +  7 * src[1] +  5 * src[0] +     src[-1] + 8) >> 4);
-            dst3[i] = (pel_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5);
-            dst4[i] = (pel_t)((    src[2] +  3 * src[1] +  3 * src[0] +     src[-1] + 4) >> 3);
+            dst1[i] = (pel8_t)((src[-1]     +  src[0] * 5 + src[1]  * 7 + src[2] * 3 + 8) >> 4);  /* same four filters as the scratch path */
+            dst2[i] = (pel8_t)((src[-1]     + (src[0]     + src[1]) * 3 + src[2]     + 4) >> 3);
+            dst3[i] = (pel8_t)((src[-1] * 3 +  src[0] * 7 + src[1]  * 5 + src[2]     + 8) >> 4);
+            dst4[i] = (pel8_t)((src[-1]     +  src[0] * 2 + src[1]                   + 2) >> 2);
         }
     }
 }
-static void intra_pred_ang_xy_14_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+
+static void intra_pred_ang10_xy_14_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
 
     if (bsy != 4) {
-        ALIGN16(pel_t first_line[4 * (64 + 16)]);
+        ALIGN16(pel10_t first_line[4 * (64 + 16)]);
         int line_size = bsx + (bsy >> 2) - 1;
         int left_size = line_size - bsx;
         int aligned_line_size = ((line_size + 15) >> 4) << 4;
-        pel_t *pfirst[4];
+        pel10_t *pfirst[4];
 
         pfirst[0] = first_line;
         pfirst[1] = first_line + aligned_line_size;
@@ -1338,17 +2528,17 @@ static void intra_pred_ang_xy_14_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo
 
         src -= bsy - 4;
         for (i = 0; i < left_size; i++, src += 4) {
-            pfirst[0][i] = (pel_t)((src[ 2] + (src[3] << 1) + src[4] + 2) >> 2);
-            pfirst[1][i] = (pel_t)((src[ 1] + (src[2] << 1) + src[3] + 2) >> 2);
-            pfirst[2][i] = (pel_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2);
-            pfirst[3][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
+            pfirst[0][i] = (pel10_t)((src[ 2] + (src[3] << 1) + src[4] + 2) >> 2);
+            pfirst[1][i] = (pel10_t)((src[ 1] + (src[2] << 1) + src[3] + 2) >> 2);
+            pfirst[2][i] = (pel10_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2);
+            pfirst[3][i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
         }
 
         for (; i < line_size; i++, src++) {
-            pfirst[0][i] = (pel_t)((src[-1]     +  src[0] * 5 + src[1]  * 7 + src[2] * 3 + 8) >> 4);
-            pfirst[1][i] = (pel_t)((src[-1]     + (src[0]     + src[1]) * 3 + src[2]     + 4) >> 3);
-            pfirst[2][i] = (pel_t)((src[-1] * 3 +  src[0] * 7 + src[1]  * 5 + src[2]     + 8) >> 4);
-            pfirst[3][i] = (pel_t)((src[-1]     +  src[0] * 2 + src[1]                   + 2) >> 2);
+            pfirst[0][i] = (pel10_t)((src[-1]     +  src[0] * 5 + src[1]  * 7 + src[2] * 3 + 8) >> 4);
+            pfirst[1][i] = (pel10_t)((src[-1]     + (src[0]     + src[1]) * 3 + src[2]     + 4) >> 3);
+            pfirst[2][i] = (pel10_t)((src[-1] * 3 +  src[0] * 7 + src[1]  * 5 + src[2]     + 8) >> 4);
+            pfirst[3][i] = (pel10_t)((src[-1]     +  src[0] * 2 + src[1]                   + 2) >> 2);
         }
 
         pfirst[0] += left_size;
@@ -1358,40 +2548,75 @@ static void intra_pred_ang_xy_14_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo
 
         bsy >>= 2;
         for (i = 0; i < bsy; i++) {
-            memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[0] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
-            memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[1] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
-            memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[2] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
-            memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst[3] - i, bsx * sizeof(pel10_t));
             dst += i_dst;
         }
     } else {
-        pel_t *dst1 = dst;
-        pel_t *dst2 = dst1 + i_dst;
-        pel_t *dst3 = dst2 + i_dst;
-        pel_t *dst4 = dst3 + i_dst;
+        pel10_t *dst1 = dst;
+        pel10_t *dst2 = dst1 + i_dst;
+        pel10_t *dst3 = dst2 + i_dst;
+        pel10_t *dst4 = dst3 + i_dst;
 
         for (i = 0; i < bsx; i++, src++) {
-            dst1[i] = (pel_t)((src[-1]     +  src[0] * 5 + src[1]  * 7 + src[2] * 3 + 8) >> 4);
-            dst2[i] = (pel_t)((src[-1]     + (src[0]     + src[1]) * 3 + src[2]     + 4) >> 3);
-            dst3[i] = (pel_t)((src[-1] * 3 +  src[0] * 7 + src[1]  * 5 + src[2]     + 8) >> 4);
-            dst4[i] = (pel_t)((src[-1]     +  src[0] * 2 + src[1]                   + 2) >> 2);
+            dst1[i] = (pel10_t)((src[-1]     +  src[0] * 5 + src[1]  * 7 + src[2] * 3 + 8) >> 4);
+            dst2[i] = (pel10_t)((src[-1]     + (src[0]     + src[1]) * 3 + src[2]     + 4) >> 3);
+            dst3[i] = (pel10_t)((src[-1] * 3 +  src[0] * 7 + src[1]  * 5 + src[2]     + 8) >> 4);
+            dst4[i] = (pel10_t)((src[-1]     +  src[0] * 2 + src[1]                   + 2) >> 2);
         }
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_xy_16_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_xy_16_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t variant; h and dir_mode are not read in the body */
+{
+    ALIGN16(pel8_t first_line[2 * (64 + 32)]);  /* two scratch rows, aligned_line_size apart */
+    int line_size = bsx + (bsy >> 1) - 1;  /* samples produced per scratch row */
+    int left_size = line_size - bsx;  /* leading samples filled by the 2-step loop below */
+    int aligned_line_size = ((line_size + 15) >> 4) << 4;  /* line_size rounded up to a multiple of 16 */
+    int i_dst2 = i_dst << 1;  /* dst advances two rows per copy iteration */
+    pel8_t *pfirst[2];
+    int i;
+
+    pfirst[0] = first_line;
+    pfirst[1] = first_line + aligned_line_size;
+
+    src -= bsy - 2;
+    for (i = 0; i < left_size; i++, src += 2) {  /* src advances two samples per output sample */
+        pfirst[0][i] = (pel8_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2);  /* rounded [1 2 1]/4 filter centred on src[1] */
+        pfirst[1][i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);  /* same filter centred on src[0] */
+    }
+
+    for (; i < line_size; i++, src++) {  /* remaining samples: src advances one sample per output sample */
+        pfirst[0][i] = (pel8_t)((src[-1] + (src[0]       + src[1]) * 3 + src[2] + 4) >> 3);  /* 4-tap weights 1,3,3,1 sum to 8, rounded */
+        pfirst[1][i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1]               + 2) >> 2);  /* 3-tap [1 2 1]/4 centred on src[0] */
+    }
+
+    pfirst[0] += left_size;  /* skip the leading samples so each pointer marks the first copy window */
+    pfirst[1] += left_size;
+
+    bsy >>= 1;  /* rows are emitted in pairs */
+    for (i = 0; i < bsy; i++) {  /* each pair starts one sample earlier in both scratch rows */
+        memcpy(dst        , pfirst[0] - i, bsx * sizeof(pel8_t));
+        memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel8_t));
+        dst += i_dst2;
+    }
+}
+
+static void intra_pred_ang10_xy_16_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
-    ALIGN16(pel_t first_line[2 * (64 + 32)]);
+    ALIGN16(pel10_t first_line[2 * (64 + 32)]);
     int line_size = bsx + (bsy >> 1) - 1;
     int left_size = line_size - bsx;
     int aligned_line_size = ((line_size + 15) >> 4) << 4;
     int i_dst2 = i_dst << 1;
-    pel_t *pfirst[2];
+    pel10_t *pfirst[2];
     int i;
 
     pfirst[0] = first_line;
@@ -1399,13 +2624,13 @@ static void intra_pred_ang_xy_16_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo
 
     src -= bsy - 2;
     for (i = 0; i < left_size; i++, src += 2) {
-        pfirst[0][i] = (pel_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2);
-        pfirst[1][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
+        pfirst[0][i] = (pel10_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2);
+        pfirst[1][i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
     }
 
     for (; i < line_size; i++, src++) {
-        pfirst[0][i] = (pel_t)((src[-1] + (src[0]       + src[1]) * 3 + src[2] + 4) >> 3);
-        pfirst[1][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1]               + 2) >> 2);
+        pfirst[0][i] = (pel10_t)((src[-1] + (src[0]       + src[1]) * 3 + src[2] + 4) >> 3);
+        pfirst[1][i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1]               + 2) >> 2);
     }
 
     pfirst[0] += left_size;
@@ -1413,28 +2638,47 @@ static void intra_pred_ang_xy_16_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo
 
     bsy >>= 1;
     for (i = 0; i < bsy; i++) {
-        memcpy(dst        , pfirst[0] - i, bsx * sizeof(pel_t));
-        memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel_t));
+        memcpy(dst        , pfirst[0] - i, bsx * sizeof(pel10_t));
+        memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel10_t));
         dst += i_dst2;
     }
 }
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_xy_18_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_xy_18_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t variant; h and dir_mode are not read in the body */
+{
+    ALIGN16(pel8_t first_line[64 + 64]);  /* single scratch row shared by all output rows */
+    int line_size = bsx + bsy - 1;  /* filtered samples produced */
+    int i;
+    pel8_t *pfirst = first_line + bsy - 1;  /* window start for the first output row */
+
+    src -= bsy - 1;
+    for (i = 0; i < line_size; i++, src++) {
+        first_line[i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);  /* rounded [1 2 1]/4 filter centred on src[0] */
+    }
+
+    for (i = 0; i < bsy; i++) {
+        memcpy(dst, pfirst, bsx * sizeof(pel8_t));
+        pfirst--;  /* each following row starts one sample earlier in the scratch row */
+        dst += i_dst;
+    }
+}
+
+static void intra_pred_ang10_xy_18_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
-    ALIGN16(pel_t first_line[64 + 64]);
+    ALIGN16(pel10_t first_line[64 + 64]);
     int line_size = bsx + bsy - 1;
     int i;
-    pel_t *pfirst = first_line + bsy - 1;
+    pel10_t *pfirst = first_line + bsy - 1;
 
     src -= bsy - 1;
     for (i = 0; i < line_size; i++, src++) {
-        first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
+        first_line[i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
     }
 
     for (i = 0; i < bsy; i++) {
-        memcpy(dst, pfirst, bsx * sizeof(pel_t));
+        memcpy(dst, pfirst, bsx * sizeof(pel10_t));
         pfirst--;
         dst += i_dst;
     }
@@ -1442,28 +2686,55 @@ static void intra_pred_ang_xy_18_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_xy_20_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_xy_20_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)  /* pel8_t variant; h and dir_mode are not read in the body */
+{
+    ALIGN16(pel8_t first_line[64 + 128]);  /* single scratch row shared by all output rows */
+    int left_size = ((bsy - 1) << 1) + 1;  /* always odd: 2 * (bsy - 1) + 1 */
+    int top_size = bsx - 1;
+    int line_size = left_size + top_size;  /* total filtered samples produced */
+    int i;
+    pel8_t *pfirst = first_line + left_size - 1;  /* window start for the first output row */
+
+    src -= bsy;
+    for (i = 0; i < left_size; i += 2, src++) {  /* two output samples per src step */
+        first_line[i    ] = (pel8_t)((src[-1] + (src[0] +  src[1]) * 3  + src[2] + 4) >> 3);  /* 4-tap weights 1,3,3,1 sum to 8, rounded */
+        first_line[i + 1] = (pel8_t)((           src[0] + (src[1] << 1) + src[2] + 2) >> 2);  /* 3-tap [1 2 1]/4 centred on src[1] */
+    }
+    i--;  /* left_size is odd, so i ends at left_size + 1; step back so the next loop rewrites first_line[left_size] */
+
+    for (; i < line_size; i++, src++) {  /* remaining samples: one output sample per src step */
+        first_line[i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);  /* 3-tap [1 2 1]/4 centred on src[0] */
+    }
+
+    for (i = 0; i < bsy; i++) {
+        memcpy(dst, pfirst, bsx * sizeof(pel8_t));
+        pfirst -= 2;  /* each following row starts two samples earlier in the scratch row */
+        dst    += i_dst;
+    }
+}
+
+static void intra_pred_ang10_xy_20_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
-    ALIGN16(pel_t first_line[64 + 128]);
+    ALIGN16(pel10_t first_line[64 + 128]);
     int left_size = ((bsy - 1) << 1) + 1;
     int top_size = bsx - 1;
     int line_size = left_size + top_size;
     int i;
-    pel_t *pfirst = first_line + left_size - 1;
+    pel10_t *pfirst = first_line + left_size - 1;
 
     src -= bsy;
     for (i = 0; i < left_size; i += 2, src++) {
-        first_line[i    ] = (pel_t)((src[-1] + (src[0] +  src[1]) * 3  + src[2] + 4) >> 3);
-        first_line[i + 1] = (pel_t)((           src[0] + (src[1] << 1) + src[2] + 2) >> 2);
+        first_line[i    ] = (pel10_t)((src[-1] + (src[0] +  src[1]) * 3  + src[2] + 4) >> 3);
+        first_line[i + 1] = (pel10_t)((           src[0] + (src[1] << 1) + src[2] + 2) >> 2);
     }
     i--;
 
     for (; i < line_size; i++, src++) {
-        first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
+        first_line[i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
     }
 
     for (i = 0; i < bsy; i++) {
-        memcpy(dst, pfirst, bsx * sizeof(pel_t));
+        memcpy(dst, pfirst, bsx * sizeof(pel10_t));
         pfirst -= 2;
         dst    += i_dst;
     }
@@ -1471,41 +2742,82 @@ static void intra_pred_ang_xy_20_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo
 
 /* ---------------------------------------------------------------------------
  */
-static void intra_pred_ang_xy_22_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_xy_22_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    int i;
+    /* bsx != 4: build one filtered line, then copy a sliding bsx-wide window of it per row; bsx == 4: filter each row directly (h and dir_mode are not read here) */
+    if (bsx != 4) {
+        src -= bsy;
+        ALIGN16(pel8_t first_line[64 + 256]);
+        int left_size = ((bsy - 1) << 2) + 3;
+        int top_size  = bsx - 3;
+        int line_size = left_size + top_size;
+        pel8_t *pfirst = first_line + left_size - 3;
+        /* each src step produces 4 line entries: three 4-tap blends and one 3-tap (1,2,1) blend */
+        for (i = 0; i < left_size; i += 4, src++) {
+            first_line[i    ] = (pel8_t)((src[-1] * 3 +  src[0] * 7 + src[1]  * 5 + src[2]     + 8) >> 4);
+            first_line[i + 1] = (pel8_t)((src[-1]     + (src[0]     + src[1]) * 3 + src[2]     + 4) >> 3);
+            first_line[i + 2] = (pel8_t)((src[-1]     +  src[0] * 5 + src[1]  * 7 + src[2] * 3 + 8) >> 4);
+            first_line[i + 3] = (pel8_t)((               src[0]     + src[1]  * 2 + src[2]     + 2) >> 2);
+        }
+        i--;  /* step back to the last slot written above (index left_size) */
+        /* the rest of the line, up to line_size, uses the 3-tap (1,2,1) filter, one entry per src step */
+        for (; i < line_size; i++, src++) {
+            first_line[i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
+        }
+        /* emit bsy rows: each row copies bsx entries, and the window start moves back 4 entries per row */
+        for (i = 0; i < bsy; i++) {
+            memcpy(dst, pfirst, bsx * sizeof(pel8_t));
+            dst    += i_dst;
+            pfirst -= 4;
+        }
+    } else {  /* bsx == 4: compute the 4 outputs of each row in place, moving src back one sample per row */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel8_t)((src[-2] * 3 +  src[-1] * 7 + src[0]  * 5 + src[1]     + 8) >> 4);
+            dst[1] = (pel8_t)((src[-2]     + (src[-1]     + src[0]) * 3 + src[1]     + 4) >> 3);
+            dst[2] = (pel8_t)((src[-2]     +  src[-1] * 5 + src[0]  * 7 + src[1] * 3 + 8) >> 4);
+            dst[3] = (pel8_t)((               src[-1]     + src[0]  * 2 + src[1]     + 2) >> 2);
+            dst += i_dst;
+        }
+        // needn't pad, (3,0) is equal for ang_x and ang_y
+    }
+}
+
+static void intra_pred_ang10_xy_22_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
 
     if (bsx != 4) {
         src -= bsy;
-        ALIGN16(pel_t first_line[64 + 256]);
+        ALIGN16(pel10_t first_line[64 + 256]);
         int left_size = ((bsy - 1) << 2) + 3;
         int top_size  = bsx - 3;
         int line_size = left_size + top_size;
-        pel_t *pfirst = first_line + left_size - 3;
+        pel10_t *pfirst = first_line + left_size - 3;
 
         for (i = 0; i < left_size; i += 4, src++) {
-            first_line[i    ] = (pel_t)((src[-1] * 3 +  src[0] * 7 + src[1]  * 5 + src[2]     + 8) >> 4);
-            first_line[i + 1] = (pel_t)((src[-1]     + (src[0]     + src[1]) * 3 + src[2]     + 4) >> 3);
-            first_line[i + 2] = (pel_t)((src[-1]     +  src[0] * 5 + src[1]  * 7 + src[2] * 3 + 8) >> 4);
-            first_line[i + 3] = (pel_t)((               src[0]     + src[1]  * 2 + src[2]     + 2) >> 2);
+            first_line[i    ] = (pel10_t)((src[-1] * 3 +  src[0] * 7 + src[1]  * 5 + src[2]     + 8) >> 4);
+            first_line[i + 1] = (pel10_t)((src[-1]     + (src[0]     + src[1]) * 3 + src[2]     + 4) >> 3);
+            first_line[i + 2] = (pel10_t)((src[-1]     +  src[0] * 5 + src[1]  * 7 + src[2] * 3 + 8) >> 4);
+            first_line[i + 3] = (pel10_t)((               src[0]     + src[1]  * 2 + src[2]     + 2) >> 2);
         }
         i--;
 
         for (; i < line_size; i++, src++) {
-            first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
+            first_line[i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2);
         }
 
         for (i = 0; i < bsy; i++) {
-            memcpy(dst, pfirst, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst, bsx * sizeof(pel10_t));
             dst    += i_dst;
             pfirst -= 4;
         }
     } else {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((src[-2] * 3 +  src[-1] * 7 + src[0]  * 5 + src[1]     + 8) >> 4);
-            dst[1] = (pel_t)((src[-2]     + (src[-1]     + src[0]) * 3 + src[1]     + 4) >> 3);
-            dst[2] = (pel_t)((src[-2]     +  src[-1] * 5 + src[0]  * 7 + src[1] * 3 + 8) >> 4);
-            dst[3] = (pel_t)((               src[-1]     + src[0]  * 2 + src[1]     + 2) >> 2);
+            dst[0] = (pel10_t)((src[-2] * 3 +  src[-1] * 7 + src[0]  * 5 + src[1]     + 8) >> 4);
+            dst[1] = (pel10_t)((src[-2]     + (src[-1]     + src[0]) * 3 + src[1]     + 4) >> 3);
+            dst[2] = (pel10_t)((src[-2]     +  src[-1] * 5 + src[0]  * 7 + src[1] * 3 + 8) >> 4);
+            dst[3] = (pel10_t)((               src[-1]     + src[0]  * 2 + src[1]     + 2) >> 2);
             dst += i_dst;
         }
         // needn't pad, (3,0) is equal for ang_x and ang_y
@@ -1514,60 +2826,119 @@ static void intra_pred_ang_xy_22_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo
 
 /* ---------------------------------------------------------------------------
 */
-static void intra_pred_ang_xy_23_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+static void intra_pred_ang8_xy_23_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+{
+    int i;
+    /* bsx > 8: build one filtered line and copy a sliding window per row; bsx == 8 and bsx < 8: filter each row directly (h and dir_mode are not read here) */
+    if (bsx > 8) {
+        ALIGN16(pel8_t first_line[64 + 512]);
+        int left_size = (bsy << 3) - 1;
+        int top_size = bsx - 7;
+        int line_size = left_size + top_size;
+        pel8_t *pfirst = first_line + left_size - 7;
+        /* start bsy samples before src; each src step below produces 8 line entries */
+        src -= bsy;
+        for (i = 0; i < left_size; i += 8, src++) {
+            first_line[i    ] = (pel8_t)((7 * src[-1] + 15 * src[0] +  9 * src[1] +     src[2] + 16) >> 5);
+            first_line[i + 1] = (pel8_t)((3 * src[-1] +  7 * src[0] +  5 * src[1] +     src[2] +  8) >> 4);
+            first_line[i + 2] = (pel8_t)((5 * src[-1] + 13 * src[0] + 11 * src[1] + 3 * src[2] + 16) >> 5);
+            first_line[i + 3] = (pel8_t)((    src[-1] +  3 * src[0] +  3 * src[1] +     src[2] +  4) >> 3);
+
+            first_line[i + 4] = (pel8_t)((3 * src[-1] + 11 * src[0] + 13 * src[1] + 5 * src[2] + 16) >> 5);
+            first_line[i + 5] = (pel8_t)((    src[-1] +  5 * src[0] +  7 * src[1] + 3 * src[2] +  8) >> 4);
+            first_line[i + 6] = (pel8_t)((    src[-1] +  9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5);
+            first_line[i + 7] = (pel8_t)((    src[ 0] +  2 * src[1] +      src[2] + 0 * src[3] +  2) >> 2);  /* src[3] has weight 0 */
+        }
+        i--;  /* step back to the last slot written above (index left_size) */
+        /* the rest of the line, up to line_size, uses the 3-tap (1,2,1) filter, one entry per src step */
+        for (; i < line_size; i++, src++) {
+            first_line[i] = (pel8_t)((src[1] + (src[0] << 1) + src[-1] + 2) >> 2);
+        }
+        /* emit bsy rows: each row copies bsx entries, and the window start moves back 8 entries per row */
+        for (i = 0; i < bsy; i++) {
+            memcpy(dst, pfirst, bsx * sizeof(pel8_t));
+            dst += i_dst;
+            pfirst -= 8;
+        }
+    } else if (bsx == 8) {  /* compute the 8 outputs of each row in place, moving src back one sample per row */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel8_t)((7 * src[-2] + 15 * src[-1] +  9 * src[0] +     src[1] + 16) >> 5);
+            dst[1] = (pel8_t)((3 * src[-2] +  7 * src[-1] +  5 * src[0] +     src[1] +  8) >> 4);
+            dst[2] = (pel8_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5);
+            dst[3] = (pel8_t)((    src[-2] +  3 * src[-1] +  3 * src[0] +     src[1] +  4) >> 3);
+
+            dst[4] = (pel8_t)((3 * src[-2] + 11 * src[-1] + 13 * src[0] + 5 * src[1] + 16) >> 5);
+            dst[5] = (pel8_t)((    src[-2] +  5 * src[-1] +  7 * src[0] + 3 * src[1] +  8) >> 4);
+            dst[6] = (pel8_t)((    src[-2] +  9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5);
+            dst[7] = (pel8_t)((    src[-1] +  2 * src[ 0] +      src[1] + 0 * src[2] +  2) >> 2);
+            dst += i_dst;
+        }
+        // needn't pad, (7,0) is equal for ang_x and ang_y
+    } else {  /* bsx < 8: only dst[0..3] are written per row (presumably bsx == 4 -- confirm against callers) */
+        for (i = 0; i < bsy; i++, src--) {
+            dst[0] = (pel8_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5);
+            dst[1] = (pel8_t)((3 * src[-2] + 7 * src[-1] + 5 * src[0] + src[1] + 8) >> 4);
+            dst[2] = (pel8_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5);
+            dst[3] = (pel8_t)((src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3);
+            dst += i_dst;
+        }
+    }
+}
+
+static void intra_pred_ang10_xy_23_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
 
     if (bsx > 8) {
-        ALIGN16(pel_t first_line[64 + 512]);
+        ALIGN16(pel10_t first_line[64 + 512]);
         int left_size = (bsy << 3) - 1;
         int top_size = bsx - 7;
         int line_size = left_size + top_size;
-        pel_t *pfirst = first_line + left_size - 7;
+        pel10_t *pfirst = first_line + left_size - 7;
 
         src -= bsy;
         for (i = 0; i < left_size; i += 8, src++) {
-            first_line[i    ] = (pel_t)((7 * src[-1] + 15 * src[0] +  9 * src[1] +     src[2] + 16) >> 5);
-            first_line[i + 1] = (pel_t)((3 * src[-1] +  7 * src[0] +  5 * src[1] +     src[2] +  8) >> 4);
-            first_line[i + 2] = (pel_t)((5 * src[-1] + 13 * src[0] + 11 * src[1] + 3 * src[2] + 16) >> 5);
-            first_line[i + 3] = (pel_t)((    src[-1] +  3 * src[0] +  3 * src[1] +     src[2] +  4) >> 3);
+            first_line[i    ] = (pel10_t)((7 * src[-1] + 15 * src[0] +  9 * src[1] +     src[2] + 16) >> 5);
+            first_line[i + 1] = (pel10_t)((3 * src[-1] +  7 * src[0] +  5 * src[1] +     src[2] +  8) >> 4);
+            first_line[i + 2] = (pel10_t)((5 * src[-1] + 13 * src[0] + 11 * src[1] + 3 * src[2] + 16) >> 5);
+            first_line[i + 3] = (pel10_t)((    src[-1] +  3 * src[0] +  3 * src[1] +     src[2] +  4) >> 3);
 
-            first_line[i + 4] = (pel_t)((3 * src[-1] + 11 * src[0] + 13 * src[1] + 5 * src[2] + 16) >> 5);
-            first_line[i + 5] = (pel_t)((    src[-1] +  5 * src[0] +  7 * src[1] + 3 * src[2] +  8) >> 4);
-            first_line[i + 6] = (pel_t)((    src[-1] +  9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5);
-            first_line[i + 7] = (pel_t)((    src[ 0] +  2 * src[1] +      src[2] + 0 * src[3] +  2) >> 2);
+            first_line[i + 4] = (pel10_t)((3 * src[-1] + 11 * src[0] + 13 * src[1] + 5 * src[2] + 16) >> 5);
+            first_line[i + 5] = (pel10_t)((    src[-1] +  5 * src[0] +  7 * src[1] + 3 * src[2] +  8) >> 4);
+            first_line[i + 6] = (pel10_t)((    src[-1] +  9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5);
+            first_line[i + 7] = (pel10_t)((    src[ 0] +  2 * src[1] +      src[2] + 0 * src[3] +  2) >> 2);
         }
         i--;
 
         for (; i < line_size; i++, src++) {
-            first_line[i] = (pel_t)((src[1] + (src[0] << 1) + src[-1] + 2) >> 2);
+            first_line[i] = (pel10_t)((src[1] + (src[0] << 1) + src[-1] + 2) >> 2);
         }
 
         for (i = 0; i < bsy; i++) {
-            memcpy(dst, pfirst, bsx * sizeof(pel_t));
+            memcpy(dst, pfirst, bsx * sizeof(pel10_t));
             dst += i_dst;
             pfirst -= 8;
         }
     } else if (bsx == 8) {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((7 * src[-2] + 15 * src[-1] +  9 * src[0] +     src[1] + 16) >> 5);
-            dst[1] = (pel_t)((3 * src[-2] +  7 * src[-1] +  5 * src[0] +     src[1] +  8) >> 4);
-            dst[2] = (pel_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5);
-            dst[3] = (pel_t)((    src[-2] +  3 * src[-1] +  3 * src[0] +     src[1] +  4) >> 3);
-
-            dst[4] = (pel_t)((3 * src[-2] + 11 * src[-1] + 13 * src[0] + 5 * src[1] + 16) >> 5);
-            dst[5] = (pel_t)((    src[-2] +  5 * src[-1] +  7 * src[0] + 3 * src[1] +  8) >> 4);
-            dst[6] = (pel_t)((    src[-2] +  9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5);
-            dst[7] = (pel_t)((    src[-1] +  2 * src[ 0] +      src[1] + 0 * src[2] +  2) >> 2);
+            dst[0] = (pel10_t)((7 * src[-2] + 15 * src[-1] +  9 * src[0] +     src[1] + 16) >> 5);
+            dst[1] = (pel10_t)((3 * src[-2] +  7 * src[-1] +  5 * src[0] +     src[1] +  8) >> 4);
+            dst[2] = (pel10_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5);
+            dst[3] = (pel10_t)((    src[-2] +  3 * src[-1] +  3 * src[0] +     src[1] +  4) >> 3);
+
+            dst[4] = (pel10_t)((3 * src[-2] + 11 * src[-1] + 13 * src[0] + 5 * src[1] + 16) >> 5);
+            dst[5] = (pel10_t)((    src[-2] +  5 * src[-1] +  7 * src[0] + 3 * src[1] +  8) >> 4);
+            dst[6] = (pel10_t)((    src[-2] +  9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5);
+            dst[7] = (pel10_t)((    src[-1] +  2 * src[ 0] +      src[1] + 0 * src[2] +  2) >> 2);
             dst += i_dst;
         }
         // needn't pad, (7,0) is equal for ang_x and ang_y
     } else {
         for (i = 0; i < bsy; i++, src--) {
-            dst[0] = (pel_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5);
-            dst[1] = (pel_t)((3 * src[-2] + 7 * src[-1] + 5 * src[0] + src[1] + 8) >> 4);
-            dst[2] = (pel_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5);
-            dst[3] = (pel_t)((src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3);
+            dst[0] = (pel10_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5);
+            dst[1] = (pel10_t)((3 * src[-2] + 7 * src[-1] + 5 * src[0] + src[1] + 8) >> 4);
+            dst[2] = (pel10_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5);
+            dst[3] = (pel10_t)((src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3);
             dst += i_dst;
         }
     }
@@ -1575,15 +2946,76 @@ static void intra_pred_ang_xy_23_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo
 
 /* ---------------------------------------------------------------------------
  * fill reference samples for intra prediction
- * LCUÄÚÔÚÉÏ±ß½çµÄPU
+ * for a PU on the top boundary inside the LCU
  */
 static
-void fill_reference_samples_0_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
+void fill_reference_samples8_0_c(xavs2_t *h, const pel8_t *pTL, int i_TL, const pel8_t *pLcuEP, pel8_t *EP, uint32_t i_avai, int bsx, int bsy)
+{
+    int num_padding = 0;
+    /* Builds the reference array EP (top-left at EP[0], top at positive and left at negative indices) from pLcuEP only; pTL and i_TL are not read here. */
+    /* fill default value, (1 << bit_depth) >> 1, into EP[-2*bsy .. 2*bsx] */
+    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1);
+
+    /* get prediction pixels ---------------------------------------
+     * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
+     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 |     0    | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4
+     */
+
+    /* fill top & top-right pixels */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
+        /* fill top pixels */
+        g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel8_t));
+    }
+
+    /* fill top-right pixels, or repeat EP[bsx] when they are unavailable */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
+        g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel8_t));
+    } else {
+        g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
+    }
+
+    /* fill extra pixels past the top-right run by repeating EP[2 * bsx] */
+    num_padding = bsy * 11 / 4 - bsx + 4;
+    if (num_padding > 0) {
+        g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3
+    }
+
+    /* fill left & left-down pixels */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
+        /* fill left pixels */
+        memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel8_t));
+    }
+
+    /* fill left-down pixels, or repeat EP[-bsy] when they are unavailable */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
+        memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel8_t));
+    } else {
+        g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy);
+    }
+
+    /* fill top-left pixel: prefer the real top-left, then the first top pixel, then the nearest left pixel */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) {
+        EP[0] = pLcuEP[0];
+    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
+        EP[0] = pLcuEP[1];
+    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
+        EP[0] = pLcuEP[-1];
+    }   /* none of the three available: EP[0] keeps the default value */
+
+    /* fill extra pixels beyond the left-down run by repeating EP[-2 * bsy] */
+    num_padding = bsx * 11 / 4 - bsy + 4;
+    if (num_padding > 0) {
+        g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3
+    }
+}
+
+static
+void fill_reference_samples10_0_c(xavs2_t *h, const pel10_t *pTL, int i_TL, const pel10_t *pLcuEP, pel10_t *EP, uint32_t i_avai, int bsx, int bsy)
 {
     int num_padding = 0;
 
     /* fill default value */
-    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1);
+    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1);
 
     /* get prediction pixels ---------------------------------------
      * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
@@ -1593,12 +3025,12 @@ void fill_reference_samples_0_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
     /* fill top & top-right pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
         /* fill top pixels */
-        g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t));
+        g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel10_t));
     }
 
     /* fill top-right pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
-        g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t));
+        g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel10_t));
     } else {
         g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
     }
@@ -1612,12 +3044,12 @@ void fill_reference_samples_0_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
     /* fill left & left-down pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
         /* fill left pixels */
-        memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t));
+        memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel10_t));
     }
 
     /* fill left-down pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
-        memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t));
+        memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel10_t));
     } else {
         g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy);
     }
@@ -1640,16 +3072,89 @@ void fill_reference_samples_0_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
 
 /* ---------------------------------------------------------------------------
  * fill reference samples for intra prediction
- * LCUÄÚÔÚÉÏ±ß½çµÄPU
+ * for a PU on the top boundary inside the LCU
  */
 static
-void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
+void fill_reference_samples8_x_c(xavs2_t *h, const pel8_t *pTL, int i_TL, const pel8_t *pLcuEP, pel8_t *EP, uint32_t i_avai, int bsx, int bsy)
+{
+    const pel8_t *pL = pTL + i_TL;  /* first left sample: pTL advanced by i_TL */
+    int num_padding = 0;
+    /* Builds EP with top/top-right/top-left taken from pLcuEP and left/left-down read one sample per i_TL step starting at pL. */
+    /* fill default value, (1 << bit_depth) >> 1, into EP[-2*bsy .. 2*bsx] */
+    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1);
+
+    /* get prediction pixels ---------------------------------------
+     * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
+     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 |     0    | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4
+     */
+
+    /* fill top & top-right pixels */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
+        /* fill top pixels */
+        g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel8_t));
+    }
+
+    /* fill top-right pixels, or repeat EP[bsx] when they are unavailable */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
+        g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel8_t));
+    } else {
+        g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
+    }
+
+    /* fill extra pixels past the top-right run by repeating EP[2 * bsx] */
+    num_padding = bsy * 11 / 4 - bsx + 4;
+    if (num_padding > 0) {
+        g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3
+    }
+
+    /* fill left & left-down pixels */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
+        const pel8_t *p_l = pL;
+        int y;
+        /* fill left pixels: sample y goes to EP[-1 - y], so the EP index decreases as y grows */
+        for (y = 0; y < bsy; y++) {
+            EP[-1 - y] = *p_l;
+            p_l += i_TL;
+        }
+    }
+
+    /* fill left-down pixels, or repeat EP[-bsy] when they are unavailable */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
+        int y;
+        const pel8_t *p_l = pL + bsy * i_TL;  /* continue bsy steps of i_TL past pL */
+
+        for (y = 0; y < bsy; y++) {
+            EP[-bsy - 1 - y] = *p_l;
+            p_l += i_TL;
+        }
+    } else {
+        g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy);
+    }
+
+    /* fill top-left pixel: prefer the real top-left, then the first top pixel, then the first left pixel */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) {
+        EP[0] = pLcuEP[0];
+    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
+        EP[0] = pLcuEP[1];
+    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
+        EP[0] = pL[0];
+    }   /* none of the three available: EP[0] keeps the default value */
+
+    /* fill extra pixels beyond the left-down run by repeating EP[-2 * bsy] */
+    num_padding = bsx * 11 / 4 - bsy + 4;
+    if (num_padding > 0) {
+        g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3
+    }
+}
+
+static
+void fill_reference_samples10_x_c(xavs2_t *h, const pel10_t *pTL, int i_TL, const pel10_t *pLcuEP, pel10_t *EP, uint32_t i_avai, int bsx, int bsy)
 {
-    const pel_t *pL = pTL + i_TL;
+    const pel10_t *pL = pTL + i_TL;
     int num_padding = 0;
 
     /* fill default value */
-    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1);
+    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1);
 
     /* get prediction pixels ---------------------------------------
      * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
@@ -1659,12 +3164,12 @@ void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
     /* fill top & top-right pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
         /* fill top pixels */
-        g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t));
+        g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel10_t));
     }
 
     /* fill top-right pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
-        g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t));
+        g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel10_t));
     } else {
         g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
     }
@@ -1677,7 +3182,7 @@ void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
 
     /* fill left & left-down pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
-        const pel_t *p_l = pL;
+        const pel10_t *p_l = pL;
         int y;
         /* fill left pixels */
         for (y = 0; y < bsy; y++) {
@@ -1689,7 +3194,7 @@ void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
     /* fill left-down pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
         int y;
-        const pel_t *p_l = pL + bsy * i_TL;
+        const pel10_t *p_l = pL + bsy * i_TL;
 
         for (y = 0; y < bsy; y++) {
             EP[-bsy - 1 - y] = *p_l;
@@ -1717,16 +3222,78 @@ void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
 
 /* ---------------------------------------------------------------------------
  * fill reference samples for intra prediction
- * LCUÄÚÔÚ×ó±ß½çÉÏµÄPU
+ * for a PU on the left boundary inside the LCU
  */
 static
-void fill_reference_samples_y_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
+void fill_reference_samples8_y_c(xavs2_t *h, const pel8_t *pTL, int i_TL, const pel8_t *pLcuEP, pel8_t *EP, uint32_t i_avai, int bsx, int bsy)
+{
+    const pel8_t *pT = pTL + 1;  /* first top sample: one element after pTL */
+    int num_padding = 0;
+    /* Builds EP with top/top-right read from pT and left/left-down/top-left copied from pLcuEP; i_TL is not read here. */
+    /* fill default value, (1 << bit_depth) >> 1, into EP[-2*bsy .. 2*bsx] */
+    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1);
+
+    /* get prediction pixels ---------------------------------------
+     * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
+     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 |     0    | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4
+     */
+
+    /* fill top & top-right pixels */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
+        /* fill top pixels */
+        g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel8_t));
+    }
+
+    /* fill top-right pixels, or repeat EP[bsx] when they are unavailable */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
+        g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel8_t));
+    } else {
+        g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
+    }
+
+    /* fill extra pixels past the top-right run by repeating EP[2 * bsx] */
+    num_padding = bsy * 11 / 4 - bsx + 4;
+    if (num_padding > 0) {
+        g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3
+    }
+
+    /* fill left & left-down pixels */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
+        /* fill left pixels */
+        memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel8_t));
+    }
+
+    /* fill left-down pixels, or repeat EP[-bsy] when they are unavailable */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
+        memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel8_t));
+    } else {
+        g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy);
+    }
+
+    /* fill top-left pixel: prefer the real top-left, then the first top pixel, then the nearest left pixel */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) {
+        EP[0] = pLcuEP[0];
+    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
+        EP[0] = pT[0];
+    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
+        EP[0] = pLcuEP[-1];
+    }   /* none of the three available: EP[0] keeps the default value */
+
+    /* fill extra pixels beyond the left-down run by repeating EP[-2 * bsy] */
+    num_padding = bsx * 11 / 4 - bsy + 4;
+    if (num_padding > 0) {
+        g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3
+    }
+}
+
+static
+void fill_reference_samples10_y_c(xavs2_t *h, const pel10_t *pTL, int i_TL, const pel10_t *pLcuEP, pel10_t *EP, uint32_t i_avai, int bsx, int bsy)
 {
-    const pel_t *pT = pTL + 1;
+    const pel10_t *pT = pTL + 1;
     int num_padding = 0;
 
     /* fill default value */
-    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1);
+    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1);
 
     /* get prediction pixels ---------------------------------------
      * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
@@ -1736,12 +3303,12 @@ void fill_reference_samples_y_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
     /* fill top & top-right pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
         /* fill top pixels */
-        g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel_t));
+        g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel10_t));
     }
 
     /* fill top-right pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
-        g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t));
+        g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel10_t));
     } else {
         g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
     }
@@ -1755,12 +3322,12 @@ void fill_reference_samples_y_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
     /* fill left & left-down pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
         /* fill left pixels */
-        memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t));
+        memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel10_t));
     }
 
     /* fill left-down pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
-        memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t));
+        memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel10_t));
     } else {
         g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy);
     }
@@ -1783,17 +3350,91 @@ void fill_reference_samples_y_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
 
 /* ---------------------------------------------------------------------------
  * fill reference samples for intra prediction
- * LCUÄÚ²»ÔÚ±ß½çÉÏµÄPU
+ * LCUå†…ä¸åœ¨è¾¹ç•Œä¸Šçš„PU
  */
 static
-void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
+void fill_reference_samples8_xy_c(xavs2_t *h, const pel8_t *pTL, int i_TL, const pel8_t *pLcuEP, pel8_t *EP, uint32_t i_avai, int bsx, int bsy)
+{
+    const pel8_t *pT = pTL + 1;     /* first top sample: one element after pTL */
+    const pel8_t *pL = pTL + i_TL;  /* first left sample: pTL advanced by i_TL */
+    int num_padding = 0;
+    /* Builds EP entirely from pTL-relative pointers: top from pT, left from pL (one sample per i_TL step), top-left from pTL[0]; pLcuEP is not read here. */
+    /* fill default value, (1 << bit_depth) >> 1, into EP[-2*bsy .. 2*bsx] */
+    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1);
+
+    /* get prediction pixels ---------------------------------------
+     * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
+     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 |     0    | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4
+     */
+
+    /* fill top & top-right pixels */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
+        /* fill top pixels */
+        g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel8_t));
+    }
+
+    /* fill top-right pixels, or repeat EP[bsx] when they are unavailable */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
+        g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel8_t));
+    } else {
+        g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
+    }
+
+    /* fill extra pixels past the top-right run by repeating EP[2 * bsx] */
+    num_padding = bsy * 11 / 4 - bsx + 4;
+    if (num_padding > 0) {
+        g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3
+    }
+
+    /* fill left & left-down pixels */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
+        const pel8_t *p_l = pL;
+        int y;
+        /* fill left pixels: sample y goes to EP[-1 - y], so the EP index decreases as y grows */
+        for (y = 0; y < bsy; y++) {
+            EP[-1 - y] = *p_l;
+            p_l += i_TL;
+        }
+    }
+
+    /* fill left-down pixels, or repeat EP[-bsy] when they are unavailable */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
+        int y;
+        const pel8_t *p_l = pL + bsy * i_TL;  /* continue bsy steps of i_TL past pL */
+
+        for (y = 0; y < bsy; y++) {
+            EP[-bsy - 1 - y] = *p_l;
+            p_l += i_TL;
+        }
+    } else {
+        g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy);
+    }
+
+    /* fill top-left pixel: prefer the real top-left, then the first top pixel, then the first left pixel */
+    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) {
+        EP[0] = pTL[0];
+    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
+        EP[0] = pT[0];
+    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
+        EP[0] = pL[0];
+    }   /* none of the three available: EP[0] keeps the default value */
+
+    /* fill extra pixels beyond the left-down run by repeating EP[-2 * bsy] */
+    num_padding = bsx * 11 / 4 - bsy + 4;
+    if (num_padding > 0) {
+        g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3
+    }
+}
+
+static
+void fill_reference_samples10_xy_c(xavs2_t *h, const pel10_t *pTL, int i_TL, const pel10_t *pLcuEP, pel10_t *EP, uint32_t i_avai, int bsx, int bsy)
 {
-    const pel_t *pT = pTL + 1;
-    const pel_t *pL = pTL + i_TL;
+    const pel10_t *pT = pTL + 1;
+    const pel10_t *pL = pTL + i_TL;
     int num_padding = 0;
 
     /* fill default value */
-    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1);
+    g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1);
 
     /* get prediction pixels ---------------------------------------
      * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
@@ -1803,12 +3444,12 @@ void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP
     /* fill top & top-right pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
         /* fill top pixels */
-        g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel_t));
+        g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel10_t));
     }
 
     /* fill top-right pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
-        g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t));
+        g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel10_t));
     } else {
         g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
     }
@@ -1821,7 +3462,7 @@ void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP
 
     /* fill left & left-down pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
-        const pel_t *p_l = pL;
+        const pel10_t *p_l = pL;
         int y;
         /* fill left pixels */
         for (y = 0; y < bsy; y++) {
@@ -1833,7 +3474,7 @@ void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP
     /* fill left-down pixels */
     if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
         int y;
-        const pel_t *p_l = pL + bsy * i_TL;
+        const pel10_t *p_l = pL + bsy * i_TL;
 
         for (y = 0; y < bsy; y++) {
             EP[-bsy - 1 - y] = *p_l;
@@ -1867,65 +3508,118 @@ void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP
 
 /* ---------------------------------------------------------------------------
  */
-void xavs2_intra_pred_init(uint32_t cpuid, intrinsic_func_t *pf)
+void xavs2_intra_pred_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf)
 {
 #define ANG_X_OFFSET    3
 #define ANG_XY_OFFSET   13
 #define ANG_Y_OFFSET    25
     int i;
 
-    intra_pred_t *ipred = pf->intraf;
+    if (param->input_sample_bit_depth == 8) {
+    intra8_pred_t *ipred = pf->intraf8;
+
+    pf->fill_edge8_f[0] = fill_reference_samples8_0_c;
+    pf->fill_edge8_f[1] = fill_reference_samples8_x_c;
+    pf->fill_edge8_f[2] = fill_reference_samples8_y_c;
+    pf->fill_edge8_f[3] = fill_reference_samples8_xy_c;
+    ipred[DC_PRED   ] = intra_pred_dc8_c;                // 0
+    ipred[PLANE_PRED] = intra_pred_plane8_c;             // 1
+    ipred[BI_PRED   ] = intra_pred_bilinear8_c;          // 2
+
+    for (i = ANG_X_OFFSET; i < VERT_PRED; i++) {
+        ipred[i     ] = intra_pred_ang8_x_c;             // 3 ~ 11
+    }
+    ipred[VERT_PRED ] = intra_pred_ver8_c;               // 12
+
+    for (i = ANG_XY_OFFSET; i < HOR_PRED; i++) {
+        ipred[i     ] = intra_pred_ang8_xy_c;            // 13 ~ 23
+    }
+
+    ipred[HOR_PRED  ] = intra_pred_hor8_c;               // 24
+    for (i = ANG_Y_OFFSET; i < NUM_INTRA_MODE; i++) {
+        ipred[i     ] = intra_pred_ang8_y_c;             // 25 ~ 32
+    }
+
+    ipred[INTRA_ANG_X_3 ]  = intra_pred_ang8_x_3_c;
+    ipred[INTRA_ANG_X_4 ]  = intra_pred_ang8_x_4_c;
+    ipred[INTRA_ANG_X_5 ]  = intra_pred_ang8_x_5_c;
+    ipred[INTRA_ANG_X_6 ]  = intra_pred_ang8_x_6_c;
+    ipred[INTRA_ANG_X_7 ]  = intra_pred_ang8_x_7_c;
+    ipred[INTRA_ANG_X_8 ]  = intra_pred_ang8_x_8_c;
+    ipred[INTRA_ANG_X_9 ]  = intra_pred_ang8_x_9_c;
+    ipred[INTRA_ANG_X_10]  = intra_pred_ang8_x_10_c;
+    ipred[INTRA_ANG_X_11]  = intra_pred_ang8_x_11_c;
+
+    ipred[INTRA_ANG_XY_13] = intra_pred_ang8_xy_13_c;
+    ipred[INTRA_ANG_XY_14] = intra_pred_ang8_xy_14_c;
+    ipred[INTRA_ANG_XY_16] = intra_pred_ang8_xy_16_c;
+    ipred[INTRA_ANG_XY_18] = intra_pred_ang8_xy_18_c;
+    ipred[INTRA_ANG_XY_20] = intra_pred_ang8_xy_20_c;
+    ipred[INTRA_ANG_XY_22] = intra_pred_ang8_xy_22_c;
+    ipred[INTRA_ANG_XY_23] = intra_pred_ang8_xy_23_c;
+
+    ipred[INTRA_ANG_Y_25]  = intra_pred_ang8_y_25_c;
+    ipred[INTRA_ANG_Y_26]  = intra_pred_ang8_y_26_c;
+    ipred[INTRA_ANG_Y_27]  = intra_pred_ang8_y_27_c;
+    ipred[INTRA_ANG_Y_28]  = intra_pred_ang8_y_28_c;
+    ipred[INTRA_ANG_Y_29]  = intra_pred_ang8_y_29_c;
+    ipred[INTRA_ANG_Y_30]  = intra_pred_ang8_y_30_c;
+    ipred[INTRA_ANG_Y_31]  = intra_pred_ang8_y_31_c;
+    ipred[INTRA_ANG_Y_32]  = intra_pred_ang8_y_32_c;
+    } else {
+    intra10_pred_t *ipred = pf->intraf10;
 
-    pf->fill_edge_f[0] = fill_reference_samples_0_c;
-    pf->fill_edge_f[1] = fill_reference_samples_x_c;
-    pf->fill_edge_f[2] = fill_reference_samples_y_c;
-    pf->fill_edge_f[3] = fill_reference_samples_xy_c;
-    ipred[DC_PRED   ] = intra_pred_dc_c;                // 0
-    ipred[PLANE_PRED] = intra_pred_plane_c;             // 1
-    ipred[BI_PRED   ] = intra_pred_bilinear_c;          // 2
+    pf->fill_edge10_f[0] = fill_reference_samples10_0_c;
+    pf->fill_edge10_f[1] = fill_reference_samples10_x_c;
+    pf->fill_edge10_f[2] = fill_reference_samples10_y_c;
+    pf->fill_edge10_f[3] = fill_reference_samples10_xy_c;
+    ipred[DC_PRED   ] = intra_pred_dc10_c;                // 0
+    ipred[PLANE_PRED] = intra_pred_plane10_c;             // 1
+    ipred[BI_PRED   ] = intra_pred_bilinear10_c;          // 2
 
     for (i = ANG_X_OFFSET; i < VERT_PRED; i++) {
-        ipred[i     ] = intra_pred_ang_x_c;             // 3 ~ 11
+        ipred[i     ] = intra_pred_ang10_x_c;             // 3 ~ 11
     }
-    ipred[VERT_PRED ] = intra_pred_ver_c;               // 12
+    ipred[VERT_PRED ] = intra_pred_ver10_c;               // 12
 
     for (i = ANG_XY_OFFSET; i < HOR_PRED; i++) {
-        ipred[i     ] = intra_pred_ang_xy_c;            // 13 ~ 23
+        ipred[i     ] = intra_pred_ang10_xy_c;            // 13 ~ 23
     }
 
-    ipred[HOR_PRED  ] = intra_pred_hor_c;               // 24
+    ipred[HOR_PRED  ] = intra_pred_hor10_c;               // 24
     for (i = ANG_Y_OFFSET; i < NUM_INTRA_MODE; i++) {
-        ipred[i     ] = intra_pred_ang_y_c;             // 25 ~ 32
-    }
-
-    ipred[INTRA_ANG_X_3 ]  = intra_pred_ang_x_3_c;
-    ipred[INTRA_ANG_X_4 ]  = intra_pred_ang_x_4_c;
-    ipred[INTRA_ANG_X_5 ]  = intra_pred_ang_x_5_c;
-    ipred[INTRA_ANG_X_6 ]  = intra_pred_ang_x_6_c;
-    ipred[INTRA_ANG_X_7 ]  = intra_pred_ang_x_7_c;
-    ipred[INTRA_ANG_X_8 ]  = intra_pred_ang_x_8_c;
-    ipred[INTRA_ANG_X_9 ]  = intra_pred_ang_x_9_c;
-    ipred[INTRA_ANG_X_10]  = intra_pred_ang_x_10_c;
-    ipred[INTRA_ANG_X_11]  = intra_pred_ang_x_11_c;
-
-    ipred[INTRA_ANG_XY_13] = intra_pred_ang_xy_13_c;
-    ipred[INTRA_ANG_XY_14] = intra_pred_ang_xy_14_c;
-    ipred[INTRA_ANG_XY_16] = intra_pred_ang_xy_16_c;
-    ipred[INTRA_ANG_XY_18] = intra_pred_ang_xy_18_c;
-    ipred[INTRA_ANG_XY_20] = intra_pred_ang_xy_20_c;
-    ipred[INTRA_ANG_XY_22] = intra_pred_ang_xy_22_c;
-    ipred[INTRA_ANG_XY_23] = intra_pred_ang_xy_23_c;
-
-    ipred[INTRA_ANG_Y_25]  = intra_pred_ang_y_25_c;
-    ipred[INTRA_ANG_Y_26]  = intra_pred_ang_y_26_c;
-    ipred[INTRA_ANG_Y_27]  = intra_pred_ang_y_27_c;
-    ipred[INTRA_ANG_Y_28]  = intra_pred_ang_y_28_c;
-    ipred[INTRA_ANG_Y_29]  = intra_pred_ang_y_29_c;
-    ipred[INTRA_ANG_Y_30]  = intra_pred_ang_y_30_c;
-    ipred[INTRA_ANG_Y_31]  = intra_pred_ang_y_31_c;
-    ipred[INTRA_ANG_Y_32]  = intra_pred_ang_y_32_c;
-
-    // TODO: 8bitÇé¿öÏÂ½Ç¶È7¡¢9¡¢11ÐÔÄÜ²»Ò»ÖÂ   20170716
+        ipred[i     ] = intra_pred_ang10_y_c;             // 25 ~ 32
+    }
+
+    ipred[INTRA_ANG_X_3 ]  = intra_pred_ang10_x_3_c;
+    ipred[INTRA_ANG_X_4 ]  = intra_pred_ang10_x_4_c;
+    ipred[INTRA_ANG_X_5 ]  = intra_pred_ang10_x_5_c;
+    ipred[INTRA_ANG_X_6 ]  = intra_pred_ang10_x_6_c;
+    ipred[INTRA_ANG_X_7 ]  = intra_pred_ang10_x_7_c;
+    ipred[INTRA_ANG_X_8 ]  = intra_pred_ang10_x_8_c;
+    ipred[INTRA_ANG_X_9 ]  = intra_pred_ang10_x_9_c;
+    ipred[INTRA_ANG_X_10]  = intra_pred_ang10_x_10_c;
+    ipred[INTRA_ANG_X_11]  = intra_pred_ang10_x_11_c;
+
+    ipred[INTRA_ANG_XY_13] = intra_pred_ang10_xy_13_c;
+    ipred[INTRA_ANG_XY_14] = intra_pred_ang10_xy_14_c;
+    ipred[INTRA_ANG_XY_16] = intra_pred_ang10_xy_16_c;
+    ipred[INTRA_ANG_XY_18] = intra_pred_ang10_xy_18_c;
+    ipred[INTRA_ANG_XY_20] = intra_pred_ang10_xy_20_c;
+    ipred[INTRA_ANG_XY_22] = intra_pred_ang10_xy_22_c;
+    ipred[INTRA_ANG_XY_23] = intra_pred_ang10_xy_23_c;
+
+    ipred[INTRA_ANG_Y_25]  = intra_pred_ang10_y_25_c;
+    ipred[INTRA_ANG_Y_26]  = intra_pred_ang10_y_26_c;
+    ipred[INTRA_ANG_Y_27]  = intra_pred_ang10_y_27_c;
+    ipred[INTRA_ANG_Y_28]  = intra_pred_ang10_y_28_c;
+    ipred[INTRA_ANG_Y_29]  = intra_pred_ang10_y_29_c;
+    ipred[INTRA_ANG_Y_30]  = intra_pred_ang10_y_30_c;
+    ipred[INTRA_ANG_Y_31]  = intra_pred_ang10_y_31_c;
+    ipred[INTRA_ANG_Y_32]  = intra_pred_ang10_y_32_c;
+    }
+
+    // TODO: in the 8-bit case, performance of angles 7, 9 and 11 is inconsistent   20170716
 #if HAVE_MMX
     if (cpuid & XAVS2_CPU_SSE42) {
         ipred[DC_PRED        ] = intra_pred_dc_sse128;
@@ -1933,6 +3627,7 @@ void xavs2_intra_pred_init(uint32_t cpuid, intrinsic_func_t *pf)
         ipred[VERT_PRED      ] = intra_pred_ver_sse128;
         ipred[PLANE_PRED     ] = intra_pred_plane_sse128;
         ipred[BI_PRED        ] = intra_pred_bilinear_sse128;
+#if !HIGH_BIT_DEPTH
         ipred[INTRA_ANG_X_3  ] = intra_pred_ang_x_3_sse128;
         ipred[INTRA_ANG_X_4  ] = intra_pred_ang_x_4_sse128;
         ipred[INTRA_ANG_X_5  ] = intra_pred_ang_x_5_sse128;
@@ -1959,9 +3654,11 @@ void xavs2_intra_pred_init(uint32_t cpuid, intrinsic_func_t *pf)
         pf->fill_edge_f[1] = fill_edge_samples_x_sse128;
         pf->fill_edge_f[2] = fill_edge_samples_y_sse128;
         pf->fill_edge_f[3] = fill_edge_samples_xy_sse128;
+#endif
     }
 
     /* 8/10bit assemble*/
+#if defined(__AVX2__)
     if (cpuid & XAVS2_CPU_AVX2) {
         ipred[DC_PRED        ] = intra_pred_dc_avx;
         ipred[HOR_PRED       ] = intra_pred_hor_avx;
@@ -1994,8 +3691,8 @@ void xavs2_intra_pred_init(uint32_t cpuid, intrinsic_func_t *pf)
         ipred[INTRA_ANG_Y_30 ] = intra_pred_ang_y_30_avx;
         ipred[INTRA_ANG_Y_31 ] = intra_pred_ang_y_31_avx;
         ipred[INTRA_ANG_Y_32 ] = intra_pred_ang_y_32_avx;
-
     }
+#endif
 #endif //if HAVE_MMX
 #undef ANG_X_OFFSET
 #undef ANG_XY_OFFSET
diff --git a/source/common/intra.h b/source/common/intra.h
index a7c4a6a..dff0973 100644
--- a/source/common/intra.h
+++ b/source/common/intra.h
@@ -46,24 +46,44 @@ uint32_t xavs2_intra_get_cu_neighbors(xavs2_t *h, cu_t *p_cu, int img_x, int img
 void xavs2_intra_fill_ref_samples_luma(xavs2_t *h, cu_t *p_cu, int img_x, int img_y,
                                        int block_x, int block_y, int bsx, int bsy);
 
-#define rdo_get_pred_intra_luma FPFX(rdo_get_pred_intra_luma)
-int rdo_get_pred_intra_luma(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
-                            pel_t *p_fenc, int mpm[], int blockidx,
+#define rdo_get_pred_intra_luma8 FPFX(rdo_get_pred_intra_luma8)
+int rdo_get_pred_intra_luma8(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                            pel8_t *p_fenc, int mpm[], int blockidx,
                             int block_x, int block_y, int block_w, int block_h);
 
-#define rdo_get_pred_intra_luma_rmd FPFX(rdo_get_pred_intra_luma_rmd)
-int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
-                                pel_t *p_fenc, int mpm[], int blockidx,
+#define rdo_get_pred_intra_luma10 FPFX(rdo_get_pred_intra_luma10)
+int rdo_get_pred_intra_luma10(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                            pel10_t *p_fenc, int mpm[], int blockidx,
+                            int block_x, int block_y, int block_w, int block_h);
+
+#define rdo_get_pred_intra_luma8_rmd FPFX(rdo_get_pred_intra_luma8_rmd)
+int rdo_get_pred_intra_luma8_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                pel8_t *p_fenc, int mpm[], int blockidx,
+                                int block_x, int block_y, int block_w, int block_h);
+
+#define rdo_get_pred_intra_luma10_rmd FPFX(rdo_get_pred_intra_luma10_rmd)
+int rdo_get_pred_intra_luma10_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                pel10_t *p_fenc, int mpm[], int blockidx,
                                 int block_x, int block_y, int block_w, int block_h);
 
-#define rdo_get_pred_intra_luma_cuda FPFX(rdo_get_pred_intra_luma_cuda)
-int rdo_get_pred_intra_luma_cuda(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
-                                 pel_t *p_fenc, int mpm[], int blockidx,
+#define rdo_get_pred_intra_luma8_cuda FPFX(rdo_get_pred_intra_luma8_cuda)
+int rdo_get_pred_intra_luma8_cuda(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                 pel8_t *p_fenc, int mpm[], int blockidx,
                                  int block_x, int block_y, int block_w, int block_h);
 
-#define rdo_get_pred_intra_luma_2nd_pass FPFX(rdo_get_pred_intra_luma_2nd_pass)
-int rdo_get_pred_intra_luma_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
-                                     pel_t *p_fenc, int mpm[], int blockidx,
+#define rdo_get_pred_intra_luma10_cuda FPFX(rdo_get_pred_intra_luma10_cuda)
+int rdo_get_pred_intra_luma10_cuda(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                 pel10_t *p_fenc, int mpm[], int blockidx,
+                                 int block_x, int block_y, int block_w, int block_h);
+
+#define rdo_get_pred_intra_luma8_2nd_pass FPFX(rdo_get_pred_intra_luma8_2nd_pass)
+int rdo_get_pred_intra_luma8_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                     pel8_t *p_fenc, int mpm[], int blockidx,
+                                     int block_x, int block_y, int block_w, int block_h);
+
+#define rdo_get_pred_intra_luma10_2nd_pass FPFX(rdo_get_pred_intra_luma10_2nd_pass)
+int rdo_get_pred_intra_luma10_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                     pel10_t *p_fenc, int mpm[], int blockidx,
                                      int block_x, int block_y, int block_w, int block_h);
 
 #define rdo_get_pred_intra_chroma FPFX(rdo_get_pred_intra_chroma)
diff --git a/source/common/mc.c b/source/common/mc.c
index c1da03e..7bba8e5 100644
--- a/source/common/mc.c
+++ b/source/common/mc.c
@@ -167,10 +167,20 @@ enum intpl_pos_e {
 /* ---------------------------------------------------------------------------
  */
 static void
-mc_copy_c(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h)
+mc_copy8_c(xavs2_t *bb, pel8_t *dst, intptr_t i_dst, pel8_t *src, intptr_t i_src, int w, int h)
 {
     while (h--) {
-        memcpy(dst, src, w * sizeof(pel_t));
+        memcpy(dst, src, w * sizeof(pel8_t));
+        dst += i_dst;
+        src += i_src;
+    }
+}
+
+static void
+mc_copy10_c(xavs2_t *bb, pel10_t *dst, intptr_t i_dst, pel10_t *src, intptr_t i_src, int w, int h)
+{
+    while (h--) {
+        memcpy(dst, src, w * sizeof(pel10_t));
         dst += i_dst;
         src += i_src;
     }
@@ -180,19 +190,29 @@ mc_copy_c(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h)
  * plane copy
  */
 static void
-plane_copy_c(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h)
+plane_copy8_c(xavs2_t *bb, pel8_t *dst, intptr_t i_dst, pel8_t *src, intptr_t i_src, int w, int h)
+{
+    while (h--) {
+        memcpy(dst, src, w * sizeof(pel8_t));
+        dst += i_dst;
+        src += i_src;
+    }
+}
+
+static void
+plane_copy10_c(xavs2_t *bb, pel10_t *dst, intptr_t i_dst, pel10_t *src, intptr_t i_src, int w, int h)
 {
     while (h--) {
-        memcpy(dst, src, w * sizeof(pel_t));
+        memcpy(dst, src, w * sizeof(pel10_t));
         dst += i_dst;
         src += i_src;
     }
 }
 
-#define PLANE_COPY(align, cpu) \
-void plane_copy_##cpu(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h)\
+#define PLANE_COPY8(align, cpu) \
+void plane_copy8_##cpu(pel8_t *dst, intptr_t i_dst, pel8_t *src, intptr_t i_src, int w, int h)\
 {\
-    int c_w = (align) / sizeof(pel_t) - 1;\
+    int c_w = (align) / sizeof(pel8_t) - 1;\
     if (w < 256) { /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
         plane_copy_c( dst, i_dst, src, i_src, w, h );\
     } else if (!(w & c_w)) {\
@@ -208,19 +228,56 @@ void plane_copy_##cpu(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, in
             }\
         }\
         /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
-        memcpy( dst, src, w*sizeof(pel_t) );\
+        memcpy( dst, src, w*sizeof(pel8_t) );\
+    }\
+}
+
+#define PLANE_COPY10(align, cpu) /* emits plane_copy10_<cpu>(): copies h rows of w pel10_t samples from src to dst */ \
+void plane_copy10_##cpu(pel10_t *dst, intptr_t i_dst, pel10_t *src, intptr_t i_src, int w, int h)\
+{\
+    int c_w = (align) / sizeof(pel10_t) - 1; /* samples per align bytes, minus one; used as a width mask below */\
+    if (w < 256) { /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
+        plane_copy10_c( NULL, dst, i_dst, src, i_src, w, h ); /* plane_copy_c was split per bit depth; the C copy never reads its first argument */\
+    } else if (!(w & c_w)) { /* no width bits under the mask: pass the whole plane to the core routine */\
+        xavs2_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
+    } else {\
+        if (--h > 0) { /* every row but one goes through the core routine with the width rounded up to the mask */\
+            if( i_src > 0 ) { /* positive stride: the row left for memcpy is the one after the h rows just copied */\
+                xavs2_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
+                dst += i_dst * h; /* strides are in pel10_t units (pointer arithmetic) */\
+                src += i_src * h;\
+            } else { /* non-positive stride: core-copy from the second row on, leave the first row for memcpy */\
+                xavs2_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
+            }\
+        }\
+        /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
+        memcpy( dst, src, w*sizeof(pel10_t) );\
+    }\
+}
 
 #if HAVE_MMX
-PLANE_COPY(16, mmx2)
+PLANE_COPY8(16, mmx2)
+PLANE_COPY10(16, mmx2)
 #endif
 
 /* ---------------------------------------------------------------------------
  * deinterleave copy, for chroma planes
  */
 static void
-plane_copy_deinterleave_c(pel_t *dstu, intptr_t i_dstu, pel_t *dstv, intptr_t i_dstv, pel_t *src, intptr_t i_src, int w, int h)
+plane_copy8_deinterleave_c(xavs2_t *bb, pel8_t *dstu, intptr_t i_dstu, pel8_t *dstv, intptr_t i_dstv, pel8_t *src, intptr_t i_src, int w, int h)
+{
+    int x, y;
+
+    for (y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src) {
+        for (x = 0; x < w; x++) {
+            dstu[x] = src[2*x    ];
+            dstv[x] = src[2*x + 1];
+        }
+    }
+}
+
+static void
+plane_copy10_deinterleave_c(xavs2_t *bb, pel10_t *dstu, intptr_t i_dstu, pel10_t *dstv, intptr_t i_dstv, pel10_t *src, intptr_t i_src, int w, int h)
 {
     int x, y;
 
@@ -272,48 +329,92 @@ void mem_repeat_8i_c(void *dst, int val, size_t count)
 /* ---------------------------------------------------------------------------
  */
 static void
-intpl_chroma_block_hor_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
+intpl_chroma8_block_hor_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const *coeff)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    int x, y, v;
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            v = (FLT_4TAP_HOR(src, x, coeff) + 32) >> 6;
+            dst[x] = (pel8_t)XAVS2_CLIP1(v);
+        }
+        src += i_src;
+        dst += i_dst;
+    }
+#undef XAVS2_CLIP1
+}
+
+static void
+intpl_chroma10_block_hor_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const *coeff)
 {
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
     int x, y, v;
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             v = (FLT_4TAP_HOR(src, x, coeff) + 32) >> 6;
-            dst[x] = (pel_t)XAVS2_CLIP1(v);
+            dst[x] = (pel10_t)XAVS2_CLIP1(v);
         }
         src += i_src;
         dst += i_dst;
     }
+#undef XAVS2_CLIP1
 }
 
 /* ---------------------------------------------------------------------------
  */
 static void
-intpl_chroma_block_ver_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
+intpl_chroma8_block_ver_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const *coeff)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    int x, y, v;
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            v = (FLT_4TAP_VER(src, x, i_src, coeff) + 32) >> 6;
+            dst[x] = (pel8_t)XAVS2_CLIP1(v);
+        }
+        src += i_src;
+        dst += i_dst;
+    }
+#undef XAVS2_CLIP1
+}
+
+static void
+intpl_chroma10_block_ver_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const *coeff)
 {
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
     int x, y, v;
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             v = (FLT_4TAP_VER(src, x, i_src, coeff) + 32) >> 6;
-            dst[x] = (pel_t)XAVS2_CLIP1(v);
+            dst[x] = (pel10_t)XAVS2_CLIP1(v);
         }
         src += i_src;
         dst += i_dst;
     }
+#undef XAVS2_CLIP1
 }
 
 /* ---------------------------------------------------------------------------
  */
 static void
-intpl_chroma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v)
+intpl_chroma8_block_ext_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v)
 {
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
     ALIGN16(int32_t tmp_res[(32 + 3) * 32]);
     int32_t *tmp = tmp_res;
-    const int shift1 = g_bit_depth - 8;
+    const int shift1 = h->param->input_sample_bit_depth - 8;
     const int add1   = (1 << shift1) >> 1;
-    const int shift2 = 20 - g_bit_depth;
-    const int add2   = 1 << (shift2 - 1); // 1<<(19-g_bit_depth)
+    const int shift2 = 20 - h->param->input_sample_bit_depth;
+    const int add2   = 1 << (shift2 - 1); // 1<<(19-h->param->input_sample_bit_depth)
     int x, y, v;
 
     src -= i_src;
@@ -329,45 +430,103 @@ intpl_chroma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             v = (FLT_4TAP_VER(tmp, x, 32, coeff_v) + add2) >> shift2;
-            dst[x] = (pel_t)XAVS2_CLIP1(v);
+            dst[x] = (pel8_t)XAVS2_CLIP1(v);
         }
         dst += i_dst;
         tmp += 32;
     }
+#undef XAVS2_CLIP1
+}
+
+static void
+intpl_chroma10_block_ext_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    ALIGN16(int32_t tmp_res[(32 + 3) * 32]);
+    int32_t *tmp = tmp_res;
+    const int shift1 = h->param->input_sample_bit_depth - 8;
+    const int add1   = (1 << shift1) >> 1;
+    const int shift2 = 20 - h->param->input_sample_bit_depth;
+    const int add2   = 1 << (shift2 - 1); // 1<<(19-h->param->input_sample_bit_depth)
+    int x, y, v;
+
+    src -= i_src;
+    for (y = -1; y < height + 2; y++) {
+        for (x = 0; x < width; x++) {
+            v = FLT_4TAP_HOR(src, x, coeff_h);
+            tmp[x] = (v + add1) >> shift1;
+        }
+        src += i_src;
+        tmp += 32;
+    }
+    tmp = tmp_res + 32;
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            v = (FLT_4TAP_VER(tmp, x, 32, coeff_v) + add2) >> shift2;
+            dst[x] = (pel10_t)XAVS2_CLIP1(v);
+        }
+        dst += i_dst;
+        tmp += 32;
+    }
+#undef XAVS2_CLIP1
 }
 
 /* ---------------------------------------------------------------------------
  */
 static void
-intpl_luma_block_hor_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
+intpl_luma8_block_hor_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const *coeff)
 {
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
     int x, y, v;
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             v = (FLT_8TAP_HOR(src, x, coeff) + 32) >> 6;
-            dst[x] = (pel_t)XAVS2_CLIP1(v);
+            dst[x] = (pel8_t)XAVS2_CLIP1(v);
         }
         src += i_src;
         dst += i_dst;
     }
+#undef XAVS2_CLIP1
+}
+
+static void
+intpl_luma10_block_hor_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const *coeff)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    int x, y, v;
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            v = (FLT_8TAP_HOR(src, x, coeff) + 32) >> 6;
+            dst[x] = (pel10_t)XAVS2_CLIP1(v);
+        }
+        src += i_src;
+        dst += i_dst;
+    }
+#undef XAVS2_CLIP1
 }
 
 /* ---------------------------------------------------------------------------
  */
-#define intpl_luma_block_ver_c intpl_luma_ver_c
+#define intpl_luma8_block_ver_c intpl_luma8_ver_c
+#define intpl_luma10_block_ver_c intpl_luma10_ver_c
 
 /* ---------------------------------------------------------------------------
  */
 static void
-intpl_luma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v)
+intpl_luma8_block_ext_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v)
 {
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
 #define TMP_STRIDE      64
 
-    const int shift1 = g_bit_depth - 8;
+    const int shift1 = h->param->input_sample_bit_depth - 8;
     const int add1   = (1 << shift1) >> 1;
-    const int shift2 = 20 - g_bit_depth;
-    const int add2   = 1 << (shift2 - 1);//1<<(19-bit_depth)
+    const int shift2 = 20 - h->param->input_sample_bit_depth;
+    const int add2   = 1 << (shift2 - 1);//1<<(19-h->input_sample_bit_depth)
 
     ALIGN16(mct_t tmp_buf[(64 + 7) * TMP_STRIDE]);
     mct_t *tmp = tmp_buf;
@@ -387,7 +546,7 @@ intpl_luma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width,
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             v = (FLT_8TAP_VER(tmp, x, TMP_STRIDE, coeff_v) + add2) >> shift2;
-            dst[x] = (pel_t)XAVS2_CLIP1(v);
+            dst[x] = (pel8_t)XAVS2_CLIP1(v);
         }
 
         dst += i_dst;
@@ -395,80 +554,198 @@ intpl_luma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width,
     }
 
 #undef TMP_STRIDE
+#undef XAVS2_CLIP1
+}
+
+static void
+intpl_luma10_block_ext_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+#define TMP_STRIDE      64
+
+    const int shift1 = h->param->input_sample_bit_depth - 8;
+    const int add1   = (1 << shift1) >> 1;
+    const int shift2 = 20 - h->param->input_sample_bit_depth;
+    const int add2   = 1 << (shift2 - 1);//1<<(19-h->input_sample_bit_depth)
+
+    ALIGN16(mct_t tmp_buf[(64 + 7) * TMP_STRIDE]);
+    mct_t *tmp = tmp_buf;
+    int x, y, v;
+
+    src -= 3 * i_src;
+    for (y = -3; y < height + 4; y++) {
+        for (x = 0; x < width; x++) {
+            v = FLT_8TAP_HOR(src, x, coeff_h);
+            tmp[x] = (mct_t)((v + add1) >> shift1);
+        }
+        src += i_src;
+        tmp += TMP_STRIDE;
+    }
+
+    tmp = tmp_buf + 3 * TMP_STRIDE;
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            v = (FLT_8TAP_VER(tmp, x, TMP_STRIDE, coeff_v) + add2) >> shift2;
+            dst[x] = (pel10_t)XAVS2_CLIP1(v);
+        }
+
+        dst += i_dst;
+        tmp += TMP_STRIDE;
+    }
+
+#undef TMP_STRIDE
+#undef XAVS2_CLIP1
 }
 
 /* ---------------------------------------------------------------------------
  */
 static void
-intpl_luma_hor_c(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
+intpl_luma8_hor_c(xavs2_t *h, pel8_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel8_t *src, int i_src, int width, int height, int8_t const *coeff)
 {
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
     int x, y, v;
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             v = FLT_8TAP_HOR(src, x, coeff);
             tmp[x] = (mct_t)v;
-            dst[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6);
+            dst[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6);
         }
         src += i_src;
         tmp += i_tmp;
         dst += i_dst;
     }
+#undef XAVS2_CLIP1
+}
+
+static void
+intpl_luma10_hor_c(xavs2_t *h, pel10_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel10_t *src, int i_src, int width, int height, int8_t const *coeff)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    int x, y, v;
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            v = FLT_8TAP_HOR(src, x, coeff);
+            tmp[x] = (mct_t)v;
+            dst[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6);
+        }
+        src += i_src;
+        tmp += i_tmp;
+        dst += i_dst;
+    }
+#undef XAVS2_CLIP1
 }
 
 /* ---------------------------------------------------------------------------
  */
 static void
-intpl_luma_ver_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
+intpl_luma8_ver_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const *coeff)
 {
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
     int x, y;
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             int v = FLT_8TAP_VER(src, x, i_src, coeff);
             v = (v + 32) >> 6;
-            dst[x] = (pel_t)XAVS2_CLIP1(v);
+            dst[x] = (pel8_t)XAVS2_CLIP1(v);
         }
         src += i_src;
         dst += i_dst;
     }
+#undef XAVS2_CLIP1
+}
+
+static void
+intpl_luma10_ver_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const *coeff)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    int x, y;
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            int v = FLT_8TAP_VER(src, x, i_src, coeff);
+            v = (v + 32) >> 6;
+            dst[x] = (pel10_t)XAVS2_CLIP1(v);
+        }
+        src += i_src;
+        dst += i_dst;
+    }
+#undef XAVS2_CLIP1
 }
 
 /* ---------------------------------------------------------------------------
  */
 static void
-intpl_luma_ver_x3_c(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff)
+intpl_luma8_ver_x3_c(xavs2_t *h, pel8_t *const dst[3], int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const **coeff)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    int x, y, v;
+    pel8_t *dst0 = dst[0];
+    pel8_t *dst1 = dst[1];
+    pel8_t *dst2 = dst[2];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            v = FLT_8TAP_VER(src, x, i_src, coeff[0]);
+            dst0[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6);
+            v = FLT_8TAP_VER(src, x, i_src, coeff[1]);
+            dst1[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6);
+            v = FLT_8TAP_VER(src, x, i_src, coeff[2]);
+            dst2[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6);
+        }
+        src  += i_src;
+        dst0 += i_dst;
+        dst1 += i_dst;
+        dst2 += i_dst;
+    }
+#undef XAVS2_CLIP1
+}
+
+static void
+intpl_luma10_ver_x3_c(xavs2_t *h, pel10_t *const dst[3], int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const **coeff)
 {
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
     int x, y, v;
-    pel_t *dst0 = dst[0];
-    pel_t *dst1 = dst[1];
-    pel_t *dst2 = dst[2];
+    pel10_t *dst0 = dst[0];
+    pel10_t *dst1 = dst[1];
+    pel10_t *dst2 = dst[2];
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             v = FLT_8TAP_VER(src, x, i_src, coeff[0]);
-            dst0[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6);
+            dst0[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6);
             v = FLT_8TAP_VER(src, x, i_src, coeff[1]);
-            dst1[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6);
+            dst1[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6);
             v = FLT_8TAP_VER(src, x, i_src, coeff[2]);
-            dst2[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6);
+            dst2[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6);
         }
         src  += i_src;
         dst0 += i_dst;
         dst1 += i_dst;
         dst2 += i_dst;
     }
+#undef XAVS2_CLIP1
 }
 
 /* ---------------------------------------------------------------------------
  */
 static void
-intpl_luma_hor_x3_c(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff)
+intpl_luma8_hor_x3_c(xavs2_t *h, pel8_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel8_t *src, int i_src, int width, int height, const int8_t **coeff)
 {
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
     int x, y, v;
-    pel_t *dst0 = dst[0];
-    pel_t *dst1 = dst[1];
-    pel_t *dst2 = dst[2];
+    pel8_t *dst0 = dst[0];
+    pel8_t *dst1 = dst[1];
+    pel8_t *dst2 = dst[2];
     mct_t *tmp0 = tmp[0];
     mct_t *tmp1 = tmp[1];
     mct_t *tmp2 = tmp[2];
@@ -477,13 +754,13 @@ intpl_luma_hor_x3_c(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_t
         for(x = 0; x < width; x++) {
             v = FLT_8TAP_HOR(src, x, coeff[0]);
             tmp0[x] = (mct_t)v;
-            dst0[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6);
+            dst0[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6);
             v = FLT_8TAP_HOR(src, x, coeff[1]);
             tmp1[x] = (mct_t)v;
-            dst1[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6);
+            dst1[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6);
             v = FLT_8TAP_HOR(src, x, coeff[2]);
             tmp2[x] = (mct_t)v;
-            dst2[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6);
+            dst2[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6);
         }
         src  += i_src;
         tmp0 += i_tmp;
@@ -493,57 +770,157 @@ intpl_luma_hor_x3_c(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_t
         dst1 += i_dst;
         dst2 += i_dst;
     }
+#undef XAVS2_CLIP1
+}
+
+static void
+intpl_luma10_hor_x3_c(xavs2_t *h, pel10_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel10_t *src, int i_src, int width, int height, const int8_t **coeff)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+    /* Three-filter pass: for each (x, y) apply FLT_8TAP_HOR (macro defined outside this hunk) with coeff[0..2]; tmp[k] keeps the raw sum, dst[k] the rounded and clamped pixel. */
+    int x, y, v;
+    pel10_t *dst0 = dst[0];
+    pel10_t *dst1 = dst[1];
+    pel10_t *dst2 = dst[2];
+    mct_t *tmp0 = tmp[0];
+    mct_t *tmp1 = tmp[1];
+    mct_t *tmp2 = tmp[2];
+    /* XAVS2_CLIP1 is the function-local macro above: it clamps to [0, (1 << h->param->input_sample_bit_depth) - 1]. */
+    for (y = 0; y < height; y++) {
+        for(x = 0; x < width; x++) {
+            v = FLT_8TAP_HOR(src, x, coeff[0]);
+            tmp0[x] = (mct_t)v;                              /* unrounded filter output, stored as mct_t */
+            dst0[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6);   /* add 32 for rounding, shift right by 6, clamp */
+            v = FLT_8TAP_HOR(src, x, coeff[1]);
+            tmp1[x] = (mct_t)v;
+            dst1[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6);
+            v = FLT_8TAP_HOR(src, x, coeff[2]);
+            tmp2[x] = (mct_t)v;
+            dst2[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6);
+        }
+        src  += i_src;    /* advance every plane by its own row stride */
+        tmp0 += i_tmp;
+        tmp1 += i_tmp;
+        tmp2 += i_tmp;
+        dst0 += i_dst;
+        dst1 += i_dst;
+        dst2 += i_dst;
+    }
+#undef XAVS2_CLIP1
 }
 
 /* ---------------------------------------------------------------------------
  */
 static void
-intpl_luma_ext_c(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff)
+intpl_luma8_ext_c(xavs2_t *h, pel8_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff)
 {
-    const int MC_SHIFT = 20 - g_bit_depth;
-    const int MC_ADD = 1 << (MC_SHIFT - 1);   // (1 << (19-g_bit_depth))
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    const int MC_SHIFT = 20 - h->param->input_sample_bit_depth;
+    const int MC_ADD = 1 << (MC_SHIFT - 1);   // (1 << (19-h->param->input_sample_bit_depth))
     int x, y;
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             int v = FLT_8TAP_VER(tmp, x, i_tmp, coeff);
             v = (v + MC_ADD) >> MC_SHIFT;
-            dst[x] = (pel_t)XAVS2_CLIP1(v);
+            dst[x] = (pel8_t)XAVS2_CLIP1(v);
         }
         dst += i_dst;
         tmp += i_tmp;
     }
+#undef XAVS2_CLIP1
 }
 
 static void
-intpl_luma_ext_x3_c(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff)
+intpl_luma10_ext_c(xavs2_t *h, pel10_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff)
 {
-    const int MC_SHIFT = 20 - g_bit_depth;
-    const int MC_ADD = 1 << (MC_SHIFT - 1);   // (1 << (19-g_bit_depth))
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    const int MC_SHIFT = 20 - h->param->input_sample_bit_depth;
+    const int MC_ADD = 1 << (MC_SHIFT - 1);   // (1 << (19-h->param->input_sample_bit_depth))
     int x, y;
 
-    pel_t *dst0 = dst[0];
-    pel_t *dst1 = dst[1];
-    pel_t *dst2 = dst[2];
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            int v = FLT_8TAP_VER(tmp, x, i_tmp, coeff);
+            v = (v + MC_ADD) >> MC_SHIFT;
+            dst[x] = (pel10_t)XAVS2_CLIP1(v);
+        }
+        dst += i_dst;
+        tmp += i_tmp;
+    }
+#undef XAVS2_CLIP1
+}
+
+/* ---------------------------------------------------------------------------
+ */
+static void
+intpl_luma8_ext_x3_c(xavs2_t *h, pel8_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+
+    const int MC_SHIFT = 20 - h->param->input_sample_bit_depth;
+    const int MC_ADD = 1 << (MC_SHIFT - 1);   // (1 << (19-h->param->input_sample_bit_depth))
+    int x, y;
+
+    pel8_t *dst0 = dst[0];
+    pel8_t *dst1 = dst[1];
+    pel8_t *dst2 = dst[2];
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             int v;
             v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[0]);
             v = (v + MC_ADD) >> MC_SHIFT;
-            dst0[x] = (pel_t)XAVS2_CLIP1(v);
+            dst0[x] = (pel8_t)XAVS2_CLIP1(v);
             v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[1]);
             v = (v + MC_ADD) >> MC_SHIFT;
-            dst1[x] = (pel_t)XAVS2_CLIP1(v);
+            dst1[x] = (pel8_t)XAVS2_CLIP1(v);
             v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[2]);
             v = (v + MC_ADD) >> MC_SHIFT;
-            dst2[x] = (pel_t)XAVS2_CLIP1(v);
+            dst2[x] = (pel8_t)XAVS2_CLIP1(v);
         }
         dst0 += i_dst;
         dst1 += i_dst;
         dst2 += i_dst;
         tmp  += i_tmp;
     }
+#undef XAVS2_CLIP1
+}
+
+static void
+intpl_luma10_ext_x3_c(xavs2_t *h, pel10_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff)
+{
+#define XAVS2_CLIP1(a)        ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a)))
+    /* Second-stage pass: applies FLT_8TAP_VER (macro defined outside this hunk) with coeff[0..2] to the mct_t plane tmp and writes three pixel planes. */
+    const int MC_SHIFT = 20 - h->param->input_sample_bit_depth;   /* right shift applied to each filter sum, e.g. 10 when the bit depth is 10 */
+    const int MC_ADD = 1 << (MC_SHIFT - 1);   // (1 << (19-h->param->input_sample_bit_depth))
+    int x, y;
+    /* MC_ADD is half of (1 << MC_SHIFT), so adding it before the shift rounds to nearest. */
+    pel10_t *dst0 = dst[0];
+    pel10_t *dst1 = dst[1];
+    pel10_t *dst2 = dst[2];
+    /* XAVS2_CLIP1 is the function-local macro above: it clamps to [0, (1 << h->param->input_sample_bit_depth) - 1]. */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            int v;
+            v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[0]);
+            v = (v + MC_ADD) >> MC_SHIFT;               /* round and scale the filter sum */
+            dst0[x] = (pel10_t)XAVS2_CLIP1(v);          /* clamp, then store */
+            v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[1]);
+            v = (v + MC_ADD) >> MC_SHIFT;
+            dst1[x] = (pel10_t)XAVS2_CLIP1(v);
+            v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[2]);
+            v = (v + MC_ADD) >> MC_SHIFT;
+            dst2[x] = (pel10_t)XAVS2_CLIP1(v);
+        }
+        dst0 += i_dst;    /* the three outputs share the stride i_dst */
+        dst1 += i_dst;
+        dst2 += i_dst;
+        tmp  += i_tmp;
+    }
+#undef XAVS2_CLIP1
 }
 
 /**
@@ -556,7 +933,7 @@ intpl_luma_ext_x3_c(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int w
  * predict one component of a luma block
  *   ref_idx - reference frame (0.. / -1:backward)
  */
-void mc_luma(pel_t *p_pred, int i_pred, int pix_quad_x, int pix_quad_y,
+void mc_luma8(xavs2_t *h, pel8_t *p_pred, int i_pred, int pix_quad_x, int pix_quad_y,
              int width, int height, const xavs2_frame_t *p_ref_frm)
 {
     int x = (pix_quad_x >> 2);
@@ -564,24 +941,56 @@ void mc_luma(pel_t *p_pred, int i_pred, int pix_quad_x, int pix_quad_y,
     int dx = pix_quad_x & 3;
     int dy = pix_quad_y & 3;
     int i_src = p_ref_frm->i_stride[0];
-    pel_t *src = p_ref_frm->filtered[(dy << 2) + dx];
+    pel8_t *src = p_ref_frm->filtered8[(dy << 2) + dx];
 
     /* fetch prediction result */
 #if ENABLE_FRAME_SUBPEL_INTPL
     if (src != NULL) {
         src += y * i_src + x;
-        g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred, i_pred, src, i_src);
+        g_funcs.pixf.copy_pp8[PART_INDEX(width, height)](p_pred, i_pred, src, i_src);
     } else {
 #endif
-        src = p_ref_frm->filtered[0] + y * i_src + x;
+        src = p_ref_frm->filtered8[0] + y * i_src + x;
         if (dx == 0 && dy == 0) {
-            g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred, i_pred, src, i_src);
+            g_funcs.pixf.copy_pp8[PART_INDEX(width, height)](p_pred, i_pred, src, i_src);
         } else if (dy == 0) {
-            g_funcs.intpl_luma_block_hor(p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx]);
+            g_funcs.intpl_luma8_block_hor(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx]);
         } else if (dx == 0) {
-            g_funcs.intpl_luma_block_ver(p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dy]);
+            g_funcs.intpl_luma8_block_ver(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dy]);
         } else {
-            g_funcs.intpl_luma_block_ext(p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx], INTPL_FILTERS[dy]);
+            g_funcs.intpl_luma8_block_ext(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx], INTPL_FILTERS[dy]);
+        }
+#if ENABLE_FRAME_SUBPEL_INTPL
+    }
+#endif
+}
+
+void mc_luma10(xavs2_t *h, pel10_t *p_pred, int i_pred, int pix_quad_x, int pix_quad_y,
+             int width, int height, const xavs2_frame_t *p_ref_frm)
+{
+    int x = (pix_quad_x >> 2);
+    int y = (pix_quad_y >> 2);
+    int dx = pix_quad_x & 3;
+    int dy = pix_quad_y & 3;
+    int i_src = p_ref_frm->i_stride[0];
+    pel10_t *src = p_ref_frm->filtered10[(dy << 2) + dx];
+
+    /* fetch prediction result */
+#if ENABLE_FRAME_SUBPEL_INTPL
+    if (src != NULL) {
+        src += y * i_src + x;
+        g_funcs.pixf.copy_pp10[PART_INDEX(width, height)](p_pred, i_pred, src, i_src);
+    } else {
+#endif
+        src = p_ref_frm->filtered10[0] + y * i_src + x;
+        if (dx == 0 && dy == 0) {
+            g_funcs.pixf.copy_pp10[PART_INDEX(width, height)](p_pred, i_pred, src, i_src);
+        } else if (dy == 0) {
+            g_funcs.intpl_luma10_block_hor(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx]);
+        } else if (dx == 0) {
+            g_funcs.intpl_luma10_block_ver(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dy]);
+        } else {
+            g_funcs.intpl_luma10_block_ext(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx], INTPL_FILTERS[dy]);
         }
 #if ENABLE_FRAME_SUBPEL_INTPL
     }
@@ -596,10 +1005,7 @@ void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int he
     int i_tmp   = frm->i_width[IMG_Y] + 2 * XAVS2_PAD;
     int width   = frm->i_width[IMG_Y] + 2 * PAD_OFFSET;
     int off_dst = start_y * stride - PAD_OFFSET;
-    pel_t *src  = frm->planes[IMG_Y] + off_dst; // reconstructed luma plane
-    pel_t *p_dst[3];
     const int8_t *p_coeffs[3];
-    pel_t *dst;
     mct_t *intpl_tmp[3];
 
     /* -------------------------------------------------------------
@@ -633,27 +1039,150 @@ void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int he
 
     /* -------------------------------------------------------------
      * interpolate horizontal positions: a.b,c */
+
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *src  = frm->planes8[IMG_Y] + off_dst; // reconstructed luma plane
+    pel8_t *p_dst[3];
+    pel8_t *dst;
+
     {
-        const int shift_h = 4;   // ÍùÉÏÆ«ÒÆ4ÐÐÖØÐÂ²åÖµÒÔ²¢ÐÐ
+        const int shift_h = 4;   // shift up by 4 rows and re-interpolate them to allow parallel processing
         intpl_tmp[0] -= shift_h * i_tmp;
         intpl_tmp[1] -= shift_h * i_tmp;
         intpl_tmp[2] -= shift_h * i_tmp;
         src          -= shift_h * stride;
         if (h->use_fractional_me > 1) {
-            p_dst[0] = frm->filtered[INTPL_POS_A] + off_dst - shift_h * stride;  // a
+            p_dst[0] = frm->filtered8[INTPL_POS_A] + off_dst - shift_h * stride;  // a
             p_coeffs[0] = INTPL_FILTERS[INTPL_POS_A];         // a
 
-            p_dst[1] = frm->filtered[INTPL_POS_B] + off_dst - shift_h * stride;  // b
+            p_dst[1] = frm->filtered8[INTPL_POS_B] + off_dst - shift_h * stride;  // b
             p_coeffs[1] = INTPL_FILTERS[INTPL_POS_B];         // b
 
-            p_dst[2] = frm->filtered[INTPL_POS_C] + off_dst - shift_h * stride;  // c
+            p_dst[2] = frm->filtered8[INTPL_POS_C] + off_dst - shift_h * stride;  // c
             p_coeffs[2] = INTPL_FILTERS[INTPL_POS_C];         // c
 
-            g_funcs.intpl_luma_hor_x3(p_dst, stride, intpl_tmp, i_tmp, src, stride, width, height + 4 + shift_h, p_coeffs);
+            g_funcs.intpl_luma8_hor_x3(h, p_dst, stride, intpl_tmp, i_tmp, src, stride, width, height + 4 + shift_h, p_coeffs);
+        } else {
+            // b
+            dst = frm->filtered8[INTPL_POS_B] + off_dst - 4 * stride;
+
+            g_funcs.intpl_luma8_hor(h, dst, stride, intpl_tmp[1], i_tmp, src, stride, width, height + 4 + shift_h, INTPL_FILTERS[INTPL_POS_B]);
+        }
+        src          += shift_h * stride;
+        intpl_tmp[0] += shift_h * i_tmp;
+        intpl_tmp[1] += shift_h * i_tmp;
+        intpl_tmp[2] += shift_h * i_tmp;
+    }
+
+    /* -------------------------------------------------------------
+     * interpolate vertical positions: d,h,n */
+    if (h->use_fractional_me > 1) {
+        p_dst[0] = frm->filtered8[INTPL_POS_D] + off_dst;  // d
+        p_coeffs[0] = INTPL_FILTERS[INTPL_POS_D >> 2];    // d
+
+        p_dst[1] = frm->filtered8[INTPL_POS_H] + off_dst;  // h
+        p_coeffs[1] = INTPL_FILTERS[INTPL_POS_H >> 2];    // h
+
+        p_dst[2] = frm->filtered8[INTPL_POS_N] + off_dst;  // n
+        p_coeffs[2] = INTPL_FILTERS[INTPL_POS_N >> 2];    // n
+
+        g_funcs.intpl_luma8_ver_x3(h, p_dst, stride, src, stride, width, height, p_coeffs);
+    } else {
+        p_dst[1] = frm->filtered8[INTPL_POS_H] + off_dst;  // h
+
+        g_funcs.intpl_luma8_ver(h, p_dst[1], stride, src, stride, width, height, INTPL_FILTERS[INTPL_POS_H >> 2]);
+    }
+
+    /* -------------------------------------------------------------
+     * interpolate tilt positions: [e,f,g; i,j,k; p,q,r] */
+    if (h->use_fractional_me > 1) {
+        // --- for e,i,p ---
+        p_dst[0] = frm->filtered8[INTPL_POS_E] + off_dst;  // e
+        p_coeffs[0] = INTPL_FILTERS[INTPL_POS_E >> 2];    // e
+
+        p_dst[1] = frm->filtered8[INTPL_POS_I] + off_dst;  // i
+        p_coeffs[1] = INTPL_FILTERS[INTPL_POS_I >> 2];    // i
+
+        p_dst[2] = frm->filtered8[INTPL_POS_P] + off_dst;  // p
+        p_coeffs[2] = INTPL_FILTERS[INTPL_POS_P >> 2];    // p
+
+        g_funcs.intpl_luma8_ext_x3(h, p_dst, stride, intpl_tmp[0], i_tmp, width, height, p_coeffs);
+
+        // --- for f,j,q ---
+        p_dst[0] = frm->filtered8[INTPL_POS_F] + off_dst;  // f
+        p_coeffs[0] = INTPL_FILTERS[INTPL_POS_F >> 2];    // f
+
+        p_dst[1] = frm->filtered8[INTPL_POS_J] + off_dst;  // j
+        p_coeffs[1] = INTPL_FILTERS[INTPL_POS_J >> 2];    // j
+
+        p_dst[2] = frm->filtered8[INTPL_POS_Q] + off_dst;  // q
+        p_coeffs[2] = INTPL_FILTERS[INTPL_POS_Q >> 2];    // q
+
+        g_funcs.intpl_luma8_ext_x3(h, p_dst, stride, intpl_tmp[1], i_tmp, width, height, p_coeffs);
+
+        // --- for g,k,r ---
+        p_dst[0] = frm->filtered8[INTPL_POS_G] + off_dst;  // g
+        p_coeffs[0] = INTPL_FILTERS[INTPL_POS_G >> 2];    // g
+
+        p_dst[1] = frm->filtered8[INTPL_POS_K] + off_dst;  // k
+        p_coeffs[1] = INTPL_FILTERS[INTPL_POS_K >> 2];    // k
+
+        p_dst[2] = frm->filtered8[INTPL_POS_R] + off_dst;  // r
+        p_coeffs[2] = INTPL_FILTERS[INTPL_POS_R >> 2];    // r
+
+        g_funcs.intpl_luma8_ext_x3(h, p_dst, stride, intpl_tmp[2], i_tmp, width, height, p_coeffs);
+    } else {
+        // j
+        dst = frm->filtered8[INTPL_POS_J] + off_dst;
+
+        g_funcs.intpl_luma8_ext(h, dst, stride, intpl_tmp[1], i_tmp, width, height, INTPL_FILTERS[INTPL_POS_J >> 2]);
+    }
+
+    /* ---------------------------------------------------------------------------
+     * expand border for all 15 filtered planes */
+    {
+        const int padh = XAVS2_PAD - PAD_OFFSET;
+        const int padv = XAVS2_PAD - PAD_OFFSET;
+        int i;
+
+        width  = frm->i_width[IMG_Y] + PAD_OFFSET * 2;
+
+        /* loop over all 15 filtered planes */
+        for (i = 1; i < 16; i++) {
+            pel8_t *pix = frm->filtered8[i];
+            if (pix != NULL) {
+                pix += start_y * stride - PAD_OFFSET;
+                plane_expand_border8(pix, stride, width, height, padh, padv, b_start, b_end);
+            }
+        }
+    }
+    } else {
+    pel10_t *src  = frm->planes10[IMG_Y] + off_dst; // reconstructed luma plane
+    pel10_t *p_dst[3];
+    pel10_t *dst;
+
+    {
+        const int shift_h = 4;   // shift up by 4 rows and re-interpolate them to allow parallel processing
+        intpl_tmp[0] -= shift_h * i_tmp;
+        intpl_tmp[1] -= shift_h * i_tmp;
+        intpl_tmp[2] -= shift_h * i_tmp;
+        src          -= shift_h * stride;
+        if (h->use_fractional_me > 1) {
+            p_dst[0] = frm->filtered10[INTPL_POS_A] + off_dst - shift_h * stride;  // a
+            p_coeffs[0] = INTPL_FILTERS[INTPL_POS_A];         // a
+
+            p_dst[1] = frm->filtered10[INTPL_POS_B] + off_dst - shift_h * stride;  // b
+            p_coeffs[1] = INTPL_FILTERS[INTPL_POS_B];         // b
+
+            p_dst[2] = frm->filtered10[INTPL_POS_C] + off_dst - shift_h * stride;  // c
+            p_coeffs[2] = INTPL_FILTERS[INTPL_POS_C];
+
+            g_funcs.intpl_luma10_hor_x3(h, p_dst, stride, intpl_tmp, i_tmp, src, stride, width, height + 4 + shift_h, p_coeffs);
         } else {
             // b
-            dst = frm->filtered[INTPL_POS_B] + off_dst - 4 * stride;
-            g_funcs.intpl_luma_hor(dst, stride, intpl_tmp[1], i_tmp, src, stride, width, height + 4 + shift_h, INTPL_FILTERS[INTPL_POS_B]);
+            dst = frm->filtered10[INTPL_POS_B] + off_dst - 4 * stride;
+
+            g_funcs.intpl_luma10_hor(h, dst, stride, intpl_tmp[1], i_tmp, src, stride, width, height + 4 + shift_h, INTPL_FILTERS[INTPL_POS_B]);
         }
         src          += shift_h * stride;
         intpl_tmp[0] += shift_h * i_tmp;
@@ -664,63 +1193,65 @@ void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int he
     /* -------------------------------------------------------------
      * interpolate vertical positions: d,h,n */
     if (h->use_fractional_me > 1) {
-        p_dst[0] = frm->filtered[INTPL_POS_D] + off_dst;  // d
+        p_dst[0] = frm->filtered10[INTPL_POS_D] + off_dst;  // d
         p_coeffs[0] = INTPL_FILTERS[INTPL_POS_D >> 2];    // d
 
-        p_dst[1] = frm->filtered[INTPL_POS_H] + off_dst;  // h
+        p_dst[1] = frm->filtered10[INTPL_POS_H] + off_dst;  // h
         p_coeffs[1] = INTPL_FILTERS[INTPL_POS_H >> 2];    // h
 
-        p_dst[2] = frm->filtered[INTPL_POS_N] + off_dst;  // n
+        p_dst[2] = frm->filtered10[INTPL_POS_N] + off_dst;  // n
         p_coeffs[2] = INTPL_FILTERS[INTPL_POS_N >> 2];    // n
 
-        g_funcs.intpl_luma_ver_x3(p_dst, stride, src, stride, width, height, p_coeffs);
+        g_funcs.intpl_luma10_ver_x3(h, p_dst, stride, src, stride, width, height, p_coeffs);
     } else {
-        p_dst[1] = frm->filtered[INTPL_POS_H] + off_dst;  // h
-        g_funcs.intpl_luma_ver(p_dst[1], stride, src, stride, width, height, INTPL_FILTERS[INTPL_POS_H >> 2]);
+        p_dst[1] = frm->filtered10[INTPL_POS_H] + off_dst;  // h
+
+        g_funcs.intpl_luma10_ver(h, p_dst[1], stride, src, stride, width, height, INTPL_FILTERS[INTPL_POS_H >> 2]);
     }
 
     /* -------------------------------------------------------------
      * interpolate tilt positions: [e,f,g; i,j,k; p,q,r] */
     if (h->use_fractional_me > 1) {
         // --- for e,i,p ---
-        p_dst[0] = frm->filtered[INTPL_POS_E] + off_dst;  // e
+        p_dst[0] = frm->filtered10[INTPL_POS_E] + off_dst;  // e
         p_coeffs[0] = INTPL_FILTERS[INTPL_POS_E >> 2];    // e
 
-        p_dst[1] = frm->filtered[INTPL_POS_I] + off_dst;  // i
+        p_dst[1] = frm->filtered10[INTPL_POS_I] + off_dst;  // i
         p_coeffs[1] = INTPL_FILTERS[INTPL_POS_I >> 2];    // i
 
-        p_dst[2] = frm->filtered[INTPL_POS_P] + off_dst;  // p
+        p_dst[2] = frm->filtered10[INTPL_POS_P] + off_dst;  // p
         p_coeffs[2] = INTPL_FILTERS[INTPL_POS_P >> 2];    // p
 
-        g_funcs.intpl_luma_ext_x3(p_dst, stride, intpl_tmp[0], i_tmp, width, height, p_coeffs);
+        g_funcs.intpl_luma10_ext_x3(h, p_dst, stride, intpl_tmp[0], i_tmp, width, height, p_coeffs);
 
         // --- for f,j,q ---
-        p_dst[0] = frm->filtered[INTPL_POS_F] + off_dst;  // f
+        p_dst[0] = frm->filtered10[INTPL_POS_F] + off_dst;  // f
         p_coeffs[0] = INTPL_FILTERS[INTPL_POS_F >> 2];    // f
 
-        p_dst[1] = frm->filtered[INTPL_POS_J] + off_dst;  // j
+        p_dst[1] = frm->filtered10[INTPL_POS_J] + off_dst;  // j
         p_coeffs[1] = INTPL_FILTERS[INTPL_POS_J >> 2];    // j
 
-        p_dst[2] = frm->filtered[INTPL_POS_Q] + off_dst;  // q
+        p_dst[2] = frm->filtered10[INTPL_POS_Q] + off_dst;  // q
         p_coeffs[2] = INTPL_FILTERS[INTPL_POS_Q >> 2];    // q
 
-        g_funcs.intpl_luma_ext_x3(p_dst, stride, intpl_tmp[1], i_tmp, width, height, p_coeffs);
+        g_funcs.intpl_luma10_ext_x3(h, p_dst, stride, intpl_tmp[1], i_tmp, width, height, p_coeffs);
 
         // --- for g,k,r ---
-        p_dst[0] = frm->filtered[INTPL_POS_G] + off_dst;  // g
+        p_dst[0] = frm->filtered10[INTPL_POS_G] + off_dst;  // g
         p_coeffs[0] = INTPL_FILTERS[INTPL_POS_G >> 2];    // g
 
-        p_dst[1] = frm->filtered[INTPL_POS_K] + off_dst;  // k
+        p_dst[1] = frm->filtered10[INTPL_POS_K] + off_dst;  // k
         p_coeffs[1] = INTPL_FILTERS[INTPL_POS_K >> 2];    // k
 
-        p_dst[2] = frm->filtered[INTPL_POS_R] + off_dst;  // r
+        p_dst[2] = frm->filtered10[INTPL_POS_R] + off_dst;  // r
         p_coeffs[2] = INTPL_FILTERS[INTPL_POS_R >> 2];    // r
 
-        g_funcs.intpl_luma_ext_x3(p_dst, stride, intpl_tmp[2], i_tmp, width, height, p_coeffs);
+        g_funcs.intpl_luma10_ext_x3(h, p_dst, stride, intpl_tmp[2], i_tmp, width, height, p_coeffs);
     } else {
         // j
-        dst = frm->filtered[INTPL_POS_J] + off_dst;
-        g_funcs.intpl_luma_ext(dst, stride, intpl_tmp[1], i_tmp, width, height, INTPL_FILTERS[INTPL_POS_J >> 2]);
+        dst = frm->filtered10[INTPL_POS_J] + off_dst;
+
+        g_funcs.intpl_luma10_ext(h, dst, stride, intpl_tmp[1], i_tmp, width, height, INTPL_FILTERS[INTPL_POS_J >> 2]);
     }
 
     /* ---------------------------------------------------------------------------
@@ -734,13 +1265,14 @@ void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int he
 
         /* loop over all 15 filtered planes */
         for (i = 1; i < 16; i++) {
-            pel_t *pix = frm->filtered[i];
+            pel10_t *pix = frm->filtered10[i];
             if (pix != NULL) {
                 pix += start_y * stride - PAD_OFFSET;
-                plane_expand_border(pix, stride, width, height, padh, padv, b_start, b_end);
+                plane_expand_border10(pix, stride, width, height, padh, padv, b_start, b_end);
             }
         }
     }
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -754,7 +1286,7 @@ void interpolate_lcu_row(xavs2_t *h, xavs2_frame_t* frm, int i_lcu_y)
     int height;
     slice_t *slice = h->slices[h->i_slice_index];
 
-    /* ÓÐÐ§²åÖµÏñËØÇøÓòµÄÆðÊ¼ºÍ½áÊøÐÐºÅ */
+    /* first and last row numbers of the valid interpolated pixel region */
     if (b_start) {
         y_start -= PAD_OFFSET;
     } else {
@@ -766,14 +1298,14 @@ void interpolate_lcu_row(xavs2_t *h, xavs2_frame_t* frm, int i_lcu_y)
         y_end -= MC_OFFSET;
     }
 
-    /* ¶àsliceÊ±¼õÉÙÈßÓàÔËËã */
+    /* reduce redundant computation when multiple slices are used */
     if (h->param->slice_num > 1 && !b_start && !b_end) {
         if (slice->i_first_lcu_y == i_lcu_y) {
-            /* SliceµÄÉÏ±ß½ç */
+            /* top boundary of the slice */
             y_start += (MC_OFFSET + PAD_OFFSET);
         }
         if (slice->i_last_lcu_y == i_lcu_y) {
-            /* SliceµÄÏÂ±ß½ç */
+            /* bottom boundary of the slice */
             y_end += PAD_OFFSET;
         }
     }
@@ -794,15 +1326,15 @@ void interpolate_lcu_row(xavs2_t *h, xavs2_frame_t* frm, int i_lcu_y)
 /* ---------------------------------------------------------------------------
  * predict one component of a chroma block
  */
-void mc_chroma(pel_t *p_pred_u, pel_t *p_pred_v, int i_pred,
+void mc_chroma8(xavs2_t *h, pel8_t *p_pred_u, pel8_t *p_pred_v, int i_pred,
                int pix_quad_x, int pix_quad_y, int width, int height,
                const xavs2_frame_t *p_ref_frm)
 {
     int posx = pix_quad_x & 7;
     int posy = pix_quad_y & 7;
     int i_src = p_ref_frm->i_stride[IMG_U];
-    pel_t *p_src_u = p_ref_frm->planes[IMG_U];
-    pel_t *p_src_v = p_ref_frm->planes[IMG_V];
+    pel8_t *p_src_u = p_ref_frm->planes8[IMG_U];
+    pel8_t *p_src_v = p_ref_frm->planes8[IMG_V];
     int src_offset = (pix_quad_y >> 3) * i_src + (pix_quad_x >> 3);
 
     p_src_u += src_offset;
@@ -810,21 +1342,55 @@ void mc_chroma(pel_t *p_pred_u, pel_t *p_pred_v, int i_pred,
 
     if (posy == 0 && posx == 0) {
         if (width != 2 && width != 6 && height != 2 && height != 6) {
-            g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred_u, i_pred, p_src_u, i_src);
-            g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred_v, i_pred, p_src_v, i_src);
+            g_funcs.pixf.copy_pp8[PART_INDEX(width, height)](p_pred_u, i_pred, p_src_u, i_src);
+            g_funcs.pixf.copy_pp8[PART_INDEX(width, height)](p_pred_v, i_pred, p_src_v, i_src);
         } else {
-            g_funcs.align_copy(p_pred_u, i_pred, p_src_u, i_src, width, height);
-            g_funcs.align_copy(p_pred_v, i_pred, p_src_v, i_src, width, height);
+            g_funcs.align_copy8(h, p_pred_u, i_pred, p_src_u, i_src, width, height);
+            g_funcs.align_copy8(h, p_pred_v, i_pred, p_src_v, i_src, width, height);
         }
     } else if (posy == 0) {
-        g_funcs.intpl_chroma_block_hor(p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx]);
-        g_funcs.intpl_chroma_block_hor(p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx]);
+        g_funcs.intpl_chroma8_block_hor(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx]);
+        g_funcs.intpl_chroma8_block_hor(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx]);
     } else if (posx == 0) {
-        g_funcs.intpl_chroma_block_ver(p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posy]);
-        g_funcs.intpl_chroma_block_ver(p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posy]);
+        g_funcs.intpl_chroma8_block_ver(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posy]);
+        g_funcs.intpl_chroma8_block_ver(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posy]);
     } else {
-        g_funcs.intpl_chroma_block_ext(p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]);
-        g_funcs.intpl_chroma_block_ext(p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]);
+        g_funcs.intpl_chroma8_block_ext(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]);
+        g_funcs.intpl_chroma8_block_ext(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]);
+    }
+}
+
+void mc_chroma10(xavs2_t *h, pel10_t *p_pred_u, pel10_t *p_pred_v, int i_pred,
+               int pix_quad_x, int pix_quad_y, int width, int height,
+               const xavs2_frame_t *p_ref_frm)
+{
+    int posx = pix_quad_x & 7;   /* low three bits: index into INTPL_FILTERS_C for the horizontal filter */
+    int posy = pix_quad_y & 7;   /* low three bits: index into INTPL_FILTERS_C for the vertical filter */
+    int i_src = p_ref_frm->i_stride[IMG_U];   /* the U stride is used for both chroma planes */
+    pel10_t *p_src_u = p_ref_frm->planes10[IMG_U];
+    pel10_t *p_src_v = p_ref_frm->planes10[IMG_V];
+    int src_offset = (pix_quad_y >> 3) * i_src + (pix_quad_x >> 3);
+    /* The remaining high bits of the coordinates give the whole-sample offset into the reference planes. */
+    p_src_u += src_offset;
+    p_src_v += src_offset;
+    /* Writes the U and V predictions of one block; each branch below issues the same call once per plane. */
+    if (posy == 0 && posx == 0) {
+        if (width != 2 && width != 6 && height != 2 && height != 6) {   /* copy_pp10 table is used only when neither dimension is 2 or 6 */
+            g_funcs.pixf.copy_pp10[PART_INDEX(width, height)](p_pred_u, i_pred, p_src_u, i_src);
+            g_funcs.pixf.copy_pp10[PART_INDEX(width, height)](p_pred_v, i_pred, p_src_v, i_src);
+        } else {
+            g_funcs.align_copy10(h, p_pred_u, i_pred, p_src_u, i_src, width, height);
+            g_funcs.align_copy10(h, p_pred_v, i_pred, p_src_v, i_src, width, height);
+        }
+    } else if (posy == 0) {   /* only posx is non-zero: horizontal filter */
+        g_funcs.intpl_chroma10_block_hor(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx]);
+        g_funcs.intpl_chroma10_block_hor(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx]);
+    } else if (posx == 0) {   /* only posy is non-zero: vertical filter */
+        g_funcs.intpl_chroma10_block_ver(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posy]);
+        g_funcs.intpl_chroma10_block_ver(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posy]);
+    } else {                  /* both non-zero: block_ext receives the posx filter and the posy filter */
+        g_funcs.intpl_chroma10_block_ext(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]);
+        g_funcs.intpl_chroma10_block_ext(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]);
     }
 }
 
@@ -838,12 +1404,32 @@ void mc_chroma(pel_t *p_pred_u, pel_t *p_pred_v, int i_pred,
 
 /* ---------------------------------------------------------------------------
  */
-static void lowres_filter_core_c(pel_t *src, int i_src, pel_t *dst, int i_dst, int width, int height)
+static void lowres_filter_core8_c(xavs2_t *h, pel8_t *src, int i_src, pel8_t *dst, int i_dst, int width, int height)
+{
+#define FILTER(a,b,c,d) ((((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1)
+    /* 2:1 downsample in both directions: each dst pixel is FILTER() of a 2x2 group of src pixels. The parameter h is not referenced in this body. */
+    int i_src2 = i_src << 1;    // stride of 2 src lines
+    int x, y;
+    pel8_t *dwn;
+    /* FILTER first takes the rounded average of each vertical pair (a,b) and (c,d), then the rounded average of those two results. */
+    for (y = 0; y < height; y++) {
+        dwn = src + i_src;      // point to down line of src
+        for (x = 0; x < width; x++) {
+            dst[x] = FILTER(src[2 * x], dwn[2 * x], src[2 * x + 1], dwn[2 * x + 1]);   /* columns 2x and 2x+1 of the two source rows */
+        }
+        src += i_src2;   /* two source rows are consumed per output row */
+        dst += i_dst;
+    }
+#undef FILTER
+}
+
+static void lowres_filter_core10_c(xavs2_t *h, pel10_t *src, int i_src, pel10_t *dst, int i_dst, int width, int height)
 {
 #define FILTER(a,b,c,d) ((((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1)
+
     int i_src2 = i_src << 1;    // stride of 2 src lines
     int x, y;
-    pel_t *dwn;
+    pel10_t *dwn;
 
     for (y = 0; y < height; y++) {
         dwn = src + i_src;      // point to down line of src
@@ -865,7 +1451,7 @@ static void lowres_filter_core_c(pel_t *src, int i_src, pel_t *dst, int i_dst, i
 /* ---------------------------------------------------------------------------
  * global function set initial
  */
-void xavs2_mem_oper_init(uint32_t cpuid, intrinsic_func_t *pf)
+void xavs2_mem_oper_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf)
 {
     pf->fast_memcpy     = memcpy;
     pf->memcpy_aligned  = memcpy;
@@ -874,10 +1460,14 @@ void xavs2_mem_oper_init(uint32_t cpuid, intrinsic_func_t *pf)
     pf->memzero_aligned = memzero_aligned_c;
     pf->mem_repeat_i    = mem_repeat_i_c;
     pf->mem_repeat_p    = memset;
-    pf->lowres_filter   = lowres_filter_core_c;
+    if (param->input_sample_bit_depth == 8) {
+    pf->lowres_filter8   = lowres_filter_core8_c;
+    } else {
+    pf->lowres_filter10   = lowres_filter_core10_c;
+    }
 
 #if ARCH_X86_64
-    pf->mem_repeat_i    = mem_repeat_8i_c;  // x64¼Ü¹¹ÏÂ£¬¼õÉÙÑ­»·´ÎÊýÍ¬Ê±Ê¹ÓÃ64Î»´ò°ü¸³Öµ
+    pf->mem_repeat_i    = mem_repeat_8i_c;  // on x64: fewer loop iterations, using 64-bit packed stores
 #endif
 
 #if HAVE_MMX
@@ -901,18 +1491,20 @@ void xavs2_mem_oper_init(uint32_t cpuid, intrinsic_func_t *pf)
         pf->memzero_aligned = xavs2_memzero_aligned_c_sse2;
         // pf->memcpy_aligned  = xavs2_memcpy_aligned_c_sse2;
         pf->lowres_filter  = xavs2_lowres_filter_core_sse2;
-        // pf->mem_repeat_i  = xavs2_mem_repeat_i_c_sse2;  // TODO: ±ÈC°æ±¾Âý£¬½ûÓÃ
+        // pf->mem_repeat_i  = xavs2_mem_repeat_i_c_sse2;  // TODO: slower than the C version, disabled
     }
 
     if (cpuid & XAVS2_CPU_SSSE3) {
         pf->lowres_filter = xavs2_lowres_filter_core_ssse3;
     }
 
+#if defined(__AVX2__)
     if (cpuid & XAVS2_CPU_AVX2) {
         pf->memzero_aligned = xavs2_memzero_aligned_c_avx;
-        // pf->mem_repeat_i    = xavs2_mem_repeat_i_c_avx;  // TODO: ±ÈC°æ±¾Âý£¬½ûÓÃ
+        // pf->mem_repeat_i    = xavs2_mem_repeat_i_c_avx;  // TODO: slower than the C version, disabled
         pf->lowres_filter   = xavs2_lowres_filter_core_avx;
     }
+#endif
 #else
     UNUSED_PARAMETER(cpuid);
 #endif
@@ -920,30 +1512,55 @@ void xavs2_mem_oper_init(uint32_t cpuid, intrinsic_func_t *pf)
 
 /* ---------------------------------------------------------------------------
  */
-void xavs2_mc_init(uint32_t cpuid, intrinsic_func_t *pf)
+void xavs2_mc_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf)
 {
+    if (param->input_sample_bit_depth == 8) {
     /* align copy */
-    pf->align_copy = mc_copy_c;
+    pf->align_copy8 = mc_copy8_c;
 
     /* plane copy */
-    pf->plane_copy = plane_copy_c;
-    pf->plane_copy_deinterleave = plane_copy_deinterleave_c;
+    pf->plane_copy8 = plane_copy8_c;
+    pf->plane_copy8_deinterleave = plane_copy8_deinterleave_c;
 
     /* interpolate */
-    pf->intpl_luma_hor = intpl_luma_hor_c;
-    pf->intpl_luma_ver = intpl_luma_ver_c;
-    pf->intpl_luma_ext = intpl_luma_ext_c;
+    pf->intpl_luma8_hor = intpl_luma8_hor_c;
+    pf->intpl_luma8_ver = intpl_luma8_ver_c;
+    pf->intpl_luma8_ext = intpl_luma8_ext_c;
+
+    pf->intpl_luma8_ver_x3 = intpl_luma8_ver_x3_c;
+    pf->intpl_luma8_hor_x3 = intpl_luma8_hor_x3_c;
+    pf->intpl_luma8_ext_x3 = intpl_luma8_ext_x3_c;
+
+    pf->intpl_luma8_block_hor   = intpl_luma8_block_hor_c;
+    pf->intpl_luma8_block_ver   = intpl_luma8_block_ver_c;
+    pf->intpl_luma8_block_ext   = intpl_luma8_block_ext_c;
+    pf->intpl_chroma8_block_hor = intpl_chroma8_block_hor_c;
+    pf->intpl_chroma8_block_ver = intpl_chroma8_block_ver_c;
+    pf->intpl_chroma8_block_ext = intpl_chroma8_block_ext_c;
+    } else {
+    /* align copy */
+    pf->align_copy10 = mc_copy10_c;
 
-    pf->intpl_luma_ver_x3 = intpl_luma_ver_x3_c;
-    pf->intpl_luma_hor_x3 = intpl_luma_hor_x3_c;
-    pf->intpl_luma_ext_x3 = intpl_luma_ext_x3_c;
+    /* plane copy */
+    pf->plane_copy10 = plane_copy10_c;
+    pf->plane_copy10_deinterleave = plane_copy10_deinterleave_c;
 
-    pf->intpl_luma_block_hor   = intpl_luma_block_hor_c;
-    pf->intpl_luma_block_ver   = intpl_luma_block_ver_c;
-    pf->intpl_luma_block_ext   = intpl_luma_block_ext_c;
-    pf->intpl_chroma_block_hor = intpl_chroma_block_hor_c;
-    pf->intpl_chroma_block_ver = intpl_chroma_block_ver_c;
-    pf->intpl_chroma_block_ext = intpl_chroma_block_ext_c;
+    /* interpolate */
+    pf->intpl_luma10_hor = intpl_luma10_hor_c;
+    pf->intpl_luma10_ver = intpl_luma10_ver_c;
+    pf->intpl_luma10_ext = intpl_luma10_ext_c;
+
+    pf->intpl_luma10_ver_x3 = intpl_luma10_ver_x3_c;
+    pf->intpl_luma10_hor_x3 = intpl_luma10_hor_x3_c;
+    pf->intpl_luma10_ext_x3 = intpl_luma10_ext_x3_c;
+
+    pf->intpl_luma10_block_hor   = intpl_luma10_block_hor_c;
+    pf->intpl_luma10_block_ver   = intpl_luma10_block_ver_c;
+    pf->intpl_luma10_block_ext   = intpl_luma10_block_ext_c;
+    pf->intpl_chroma10_block_hor = intpl_chroma10_block_hor_c;
+    pf->intpl_chroma10_block_ver = intpl_chroma10_block_ver_c;
+    pf->intpl_chroma10_block_ext = intpl_chroma10_block_ext_c;
+    }
 
 #if HAVE_MMX
     if (cpuid & XAVS2_CPU_MMX2) {
@@ -951,6 +1568,7 @@ void xavs2_mc_init(uint32_t cpuid, intrinsic_func_t *pf)
         pf->plane_copy_deinterleave = xavs2_plane_copy_deinterleave_mmx;
     }
 
+#if !HIGH_BIT_DEPTH
     if (cpuid & XAVS2_CPU_SSE42) {
         pf->intpl_luma_hor = intpl_luma_hor_sse128;
         pf->intpl_luma_ver = intpl_luma_ver_sse128;
@@ -967,7 +1585,9 @@ void xavs2_mc_init(uint32_t cpuid, intrinsic_func_t *pf)
         pf->intpl_chroma_block_ver = intpl_chroma_block_ver_sse128;
         pf->intpl_chroma_block_ext = intpl_chroma_block_ext_sse128;
     }
+#endif
 
+#if defined(__AVX2__)
     if (cpuid & XAVS2_CPU_AVX2) {
         pf->intpl_luma_hor = intpl_luma_hor_avx2;
         pf->intpl_luma_ver = intpl_luma_ver_avx2;
@@ -985,6 +1605,7 @@ void xavs2_mc_init(uint32_t cpuid, intrinsic_func_t *pf)
         pf->intpl_chroma_block_hor = intpl_chroma_block_hor_avx2;
         pf->intpl_chroma_block_ext = intpl_chroma_block_ext_avx2;
     }
+#endif
 #else
     UNUSED_PARAMETER(cpuid);
 #endif
diff --git a/source/common/mc.h b/source/common/mc.h
index 6df7db4..e635464 100644
--- a/source/common/mc.h
+++ b/source/common/mc.h
@@ -44,16 +44,16 @@
  */
 
 /* ---------------------------------------------------------------------------
- * img_size: ÕûÏñËØ¾«¶ÈµÄÍ¼Ïñ ¿í¶È»ò¸ß¶È £¨ÕûÏñËØ¾«¶È£©
- * blk_size: µ±Ç°Ô¤²â¿éµÄ ¿í¶È»ò¸ß¶È     £¨ÕûÏñËØ¾«¶È£©
- * blk_pos:  µ±Ç°¿éÔÚÍ¼ÏñÖÐµÄ x/y ×ø±ê   £¨ÕûÏñËØ¾«¶È£©
- * mv     :  MV µÄ x/y ·ÖÁ¿             £¨1/4ÏñËØ¾«¶È£©
+ * img_size: width or height of the picture                     (integer-pel precision)
+ * blk_size: width or height of the current prediction block    (integer-pel precision)
+ * blk_pos:  x/y coordinate of the current block in the picture (integer-pel precision)
+ * mv     :  x/y component of the MV                            (quarter-pel precision)
  */
 static INLINE
 int cu_get_mc_pos(int img_size, int blk_size, int blk_pos, int mv)
 {
-    int imv = mv >> 2;  // MVµÄÕûÏñËØ¾«¶È
-    int fmv = mv & 7;   // MVµÄ·ÖÏñËØ¾«¶È²¿·Ö£¬±£Áôµ½ 1/8 ¾«¶È
+    int imv = mv >> 2;  // integer-pel part of the MV
+    int fmv = mv & 7;   // fractional-pel part of the MV, kept to 1/8 precision
 
     if (blk_pos + imv < -blk_size - 8) {
         return ((-blk_size - 8) << 2) + (fmv);
@@ -69,7 +69,7 @@ int cu_get_mc_pos(int img_size, int blk_size, int blk_pos, int mv)
 static ALWAYS_INLINE
 void get_mv_for_mc(xavs2_t *h, mv_t *mv, int pic_pix_x, int pic_pix_y, int blk_w, int blk_h)
 {
-    // WARNING: ÔÚÍ¼Ïñ·Ö±æÂÊÎª 4K ¼°ÒÔÏÂÊ±£¬¾«¶È×ã¹»£»8K Ê±²»¹»ÓÃ
+    // WARNING: the precision is sufficient for picture resolutions up to 4K, but not for 8K
     mv->x = (int16_t)cu_get_mc_pos(h->i_width,  blk_w, pic_pix_x, mv->x);
     mv->y = (int16_t)cu_get_mc_pos(h->i_height, blk_h, pic_pix_y, mv->y);
 }
@@ -85,13 +85,21 @@ void interpolate_lcu_row(xavs2_t *h, xavs2_frame_t* frm, int i_lcu_y);
 #define interpolate_sample_rows FPFX(interpolate_sample_rows)
 void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int height, int b_start, int b_end);
 
-#define mc_luma FPFX(mc_luma)
-void mc_luma  (pel_t *p_pred, int i_pred,
+#define mc_luma8 FPFX(mc_luma8)
+void mc_luma8  (xavs2_t *h, pel8_t *p_pred, int i_pred,
+               int pic_pix_x, int pic_pix_y, int width, int height,
+               const xavs2_frame_t *p_ref_frm);
+#define mc_luma10 FPFX(mc_luma10)
+void mc_luma10  (xavs2_t *h, pel10_t *p_pred, int i_pred,
                int pic_pix_x, int pic_pix_y, int width, int height,
                const xavs2_frame_t *p_ref_frm);
 
-#define mc_chroma FPFX(mc_chroma)
-void mc_chroma(pel_t *p_pred_u, pel_t *p_pred_v, int i_pred,
+#define mc_chroma8 FPFX(mc_chroma8)
+void mc_chroma8(xavs2_t *h, pel8_t *p_pred_u, pel8_t *p_pred_v, int i_pred,
+               int pix_quad_x, int pix_quad_y, int width, int height,
+               const xavs2_frame_t *p_ref_frm);
+#define mc_chroma10 FPFX(mc_chroma10)
+void mc_chroma10(xavs2_t *h, pel10_t *p_pred_u, pel10_t *p_pred_v, int i_pred,
                int pix_quad_x, int pix_quad_y, int width, int height,
                const xavs2_frame_t *p_ref_frm);
 
diff --git a/source/common/osdep.h b/source/common/osdep.h
index 51a90f7..27ad05b 100644
--- a/source/common/osdep.h
+++ b/source/common/osdep.h
@@ -169,7 +169,7 @@
 #  define ALIGN_256_PTR(p)      (p) = (uint8_t *)((intptr_t)((p) + (CACHE_LINE_256B - 1)) & (~(intptr_t)(CACHE_LINE_256B - 1)))
 
 #if defined(_MSC_VER)
-#pragma warning(disable:4324)   /* disable warning C4324: ÓÉÓÚ __declspec(align())£¬½á¹¹±»Ìî³ä */
+#pragma warning(disable:4324)   /* disable warning C4324: structure was padded due to __declspec(align()) */
 #define DECLARE_ALIGNED(var, n) __declspec(align(n)) var
 #else
 #define DECLARE_ALIGNED(var, n) var __attribute__((aligned(n)))
@@ -216,7 +216,7 @@
 #define ALIGNED_ARRAY_64(...)   EXPAND(ALIGNED_ARRAY_EMU(63, __VA_ARGS__))
 
 /* For AVX2 */
-#if ARCH_X86 || ARCH_X86_64
+#if defined(__AVX2__) && (ARCH_X86 || ARCH_X86_64)
 #define NATIVE_ALIGN            32
 #define ALIGNED_N               ALIGN32
 #define ALIGNED_ARRAY_N         ALIGNED_ARRAY_32
diff --git a/source/common/pixel.c b/source/common/pixel.c
index cf6961d..e64ed04 100644
--- a/source/common/pixel.c
+++ b/source/common/pixel.c
@@ -85,13 +85,14 @@ const uint8_t g_partition_map_tab[] = {
  * ===========================================================================
  */
 
+//#if !HIGH_BIT_DEPTH
 /**
  * ---------------------------------------------------------------------------
  * SAD
  * ---------------------------------------------------------------------------
  */
-#define PIXEL_SAD_C(w, h) \
-static cmp_dist_t xavs2_pixel_sad_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\
+#define PIXEL_SAD8_C(w, h) \
+static cmp_dist_t xavs2_pixel_sad8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\
 {\
     cmp_dist_t sum = 0;\
     int x, y;\
@@ -108,31 +109,75 @@ static cmp_dist_t xavs2_pixel_sad_##w##x##h(const pel_t *pix1, intptr_t i_pix1,
     return sum;\
 }
 
-PIXEL_SAD_C(64, 64)     /* 64x64 */
-PIXEL_SAD_C(64, 32)
-PIXEL_SAD_C(32, 64)
-PIXEL_SAD_C(64, 16)
-PIXEL_SAD_C(64, 48)
-PIXEL_SAD_C(16, 64)
-PIXEL_SAD_C(48, 64)
-PIXEL_SAD_C(32, 32)     /* 32x32 */
-PIXEL_SAD_C(32, 16)
-PIXEL_SAD_C(16, 32)
-PIXEL_SAD_C(32,  8)
-PIXEL_SAD_C(32, 24)
-PIXEL_SAD_C( 8, 32)
-PIXEL_SAD_C(24, 32)
-PIXEL_SAD_C(16, 16)     /* 16x16 */
-PIXEL_SAD_C(16,  8)
-PIXEL_SAD_C( 8, 16)
-PIXEL_SAD_C(16,  4)
-PIXEL_SAD_C(16, 12)
-PIXEL_SAD_C( 4, 16)
-PIXEL_SAD_C(12, 16)
-PIXEL_SAD_C( 8,  8)     /* 8x8 */
-PIXEL_SAD_C( 8,  4)
-PIXEL_SAD_C( 4,  8)
-PIXEL_SAD_C( 4,  4)     /* 4x4 */
+PIXEL_SAD8_C(64, 64)     /* 64x64 */
+PIXEL_SAD8_C(64, 32)
+PIXEL_SAD8_C(32, 64)
+PIXEL_SAD8_C(64, 16)
+PIXEL_SAD8_C(64, 48)
+PIXEL_SAD8_C(16, 64)
+PIXEL_SAD8_C(48, 64)
+PIXEL_SAD8_C(32, 32)     /* 32x32 */
+PIXEL_SAD8_C(32, 16)
+PIXEL_SAD8_C(16, 32)
+PIXEL_SAD8_C(32,  8)
+PIXEL_SAD8_C(32, 24)
+PIXEL_SAD8_C( 8, 32)
+PIXEL_SAD8_C(24, 32)
+PIXEL_SAD8_C(16, 16)     /* 16x16 */
+PIXEL_SAD8_C(16,  8)
+PIXEL_SAD8_C( 8, 16)
+PIXEL_SAD8_C(16,  4)
+PIXEL_SAD8_C(16, 12)
+PIXEL_SAD8_C( 4, 16)
+PIXEL_SAD8_C(12, 16)
+PIXEL_SAD8_C( 8,  8)     /* 8x8 */
+PIXEL_SAD8_C( 8,  4)
+PIXEL_SAD8_C( 4,  8)
+PIXEL_SAD8_C( 4,  4)     /* 4x4 */
+
+/* Emits xavs2_pixel_sad10_WxH(): sum of absolute differences between two w x h blocks of pel10_t */
+/* samples with strides i_pix1 / i_pix2; each pass of the inner loop handles four columns.        */
+#define PIXEL_SAD10_C(w, h) \
+static cmp_dist_t xavs2_pixel_sad10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\
+{\
+    cmp_dist_t acc = 0;\
+    int row, col;\
+    for (row = 0; row < h; row++, pix1 += i_pix1, pix2 += i_pix2) {\
+        for (col = 0; col < w; col += 4) {\
+            acc += abs(pix1[col + 0] - pix2[col + 0]);\
+            acc += abs(pix1[col + 1] - pix2[col + 1]);\
+            acc += abs(pix1[col + 2] - pix2[col + 2]);\
+            acc += abs(pix1[col + 3] - pix2[col + 3]);\
+        }\
+    }\
+    return acc;\
+}
+
+PIXEL_SAD10_C(64, 64)     /* 64x64 */
+PIXEL_SAD10_C(64, 32)
+PIXEL_SAD10_C(32, 64)
+PIXEL_SAD10_C(64, 16)
+PIXEL_SAD10_C(64, 48)
+PIXEL_SAD10_C(16, 64)
+PIXEL_SAD10_C(48, 64)
+PIXEL_SAD10_C(32, 32)     /* 32x32 */
+PIXEL_SAD10_C(32, 16)
+PIXEL_SAD10_C(16, 32)
+PIXEL_SAD10_C(32,  8)
+PIXEL_SAD10_C(32, 24)
+PIXEL_SAD10_C( 8, 32)
+PIXEL_SAD10_C(24, 32)
+PIXEL_SAD10_C(16, 16)     /* 16x16 */
+PIXEL_SAD10_C(16,  8)
+PIXEL_SAD10_C( 8, 16)
+PIXEL_SAD10_C(16,  4)
+PIXEL_SAD10_C(16, 12)
+PIXEL_SAD10_C( 4, 16)
+PIXEL_SAD10_C(12, 16)
+PIXEL_SAD10_C( 8,  8)     /* 8x8 */
+PIXEL_SAD10_C( 8,  4)
+PIXEL_SAD10_C( 4,  8)
+PIXEL_SAD10_C( 4,  4)     /* 4x4 */
 
 
 /**
@@ -140,8 +185,8 @@ PIXEL_SAD_C( 4,  4)     /* 4x4 */
  * SAD x3
  * ---------------------------------------------------------------------------
  */
-#define PIXEL_SAD_X3_C(w, h) \
-void xavs2_pixel_sad_x3_##w##x##h(const pel_t* pix1, const pel_t* pix2, const pel_t* pix3, const pel_t* pix4, intptr_t i_fref_stride, int32_t* res)\
+#define PIXEL_SAD8_X3_C(w, h) \
+void xavs2_pixel_sad8_x3_##w##x##h(const pel8_t* pix1, const pel8_t* pix2, const pel8_t* pix3, const pel8_t* pix4, intptr_t i_fref_stride, int32_t* res)\
 {\
     int x, y;\
     res[0] = 0;\
@@ -160,31 +205,77 @@ void xavs2_pixel_sad_x3_##w##x##h(const pel_t* pix1, const pel_t* pix2, const pe
     }\
 }
 
-PIXEL_SAD_X3_C(64, 64)  /* 64x64 */
-PIXEL_SAD_X3_C(64, 32)
-PIXEL_SAD_X3_C(32, 64)
-PIXEL_SAD_X3_C(64, 16)
-PIXEL_SAD_X3_C(64, 48)
-PIXEL_SAD_X3_C(16, 64)
-PIXEL_SAD_X3_C(48, 64)
-PIXEL_SAD_X3_C(32, 32)  /* 32x32 */
-PIXEL_SAD_X3_C(32, 16)
-PIXEL_SAD_X3_C(16, 32)
-PIXEL_SAD_X3_C(32,  8)
-PIXEL_SAD_X3_C(32, 24)
-PIXEL_SAD_X3_C( 8, 32)
-PIXEL_SAD_X3_C(24, 32)
-PIXEL_SAD_X3_C(16, 16)  /* 16x16 */
-PIXEL_SAD_X3_C(16,  8)
-PIXEL_SAD_X3_C( 8, 16)
-PIXEL_SAD_X3_C(16,  4)
-PIXEL_SAD_X3_C(16, 12)
-PIXEL_SAD_X3_C( 4, 16)
-PIXEL_SAD_X3_C(12, 16)
-PIXEL_SAD_X3_C( 8,  8)  /* 8x8 */
-PIXEL_SAD_X3_C( 8,  4)
-PIXEL_SAD_X3_C( 4,  8)
-PIXEL_SAD_X3_C( 4,  4)  /* 4x4 */
+PIXEL_SAD8_X3_C(64, 64)  /* 64x64 */
+PIXEL_SAD8_X3_C(64, 32)
+PIXEL_SAD8_X3_C(32, 64)
+PIXEL_SAD8_X3_C(64, 16)
+PIXEL_SAD8_X3_C(64, 48)
+PIXEL_SAD8_X3_C(16, 64)
+PIXEL_SAD8_X3_C(48, 64)
+PIXEL_SAD8_X3_C(32, 32)  /* 32x32 */
+PIXEL_SAD8_X3_C(32, 16)
+PIXEL_SAD8_X3_C(16, 32)
+PIXEL_SAD8_X3_C(32,  8)
+PIXEL_SAD8_X3_C(32, 24)
+PIXEL_SAD8_X3_C( 8, 32)
+PIXEL_SAD8_X3_C(24, 32)
+PIXEL_SAD8_X3_C(16, 16)  /* 16x16 */
+PIXEL_SAD8_X3_C(16,  8)
+PIXEL_SAD8_X3_C( 8, 16)
+PIXEL_SAD8_X3_C(16,  4)
+PIXEL_SAD8_X3_C(16, 12)
+PIXEL_SAD8_X3_C( 4, 16)
+PIXEL_SAD8_X3_C(12, 16)
+PIXEL_SAD8_X3_C( 8,  8)  /* 8x8 */
+PIXEL_SAD8_X3_C( 8,  4)
+PIXEL_SAD8_X3_C( 4,  8)
+PIXEL_SAD8_X3_C( 4,  4)  /* 4x4 */
+
+/* Emits xavs2_pixel_sad10_x3_WxH(): SADs of one w x h pel10_t block (pix1, stride FENC_STRIDE) against */
+/* three candidate blocks (pix2..pix4, common stride i_fref_stride); results are stored in res[0..2].   */
+#define PIXEL_SAD10_X3_C(w, h) \
+void xavs2_pixel_sad10_x3_##w##x##h(const pel10_t* pix1, const pel10_t* pix2, const pel10_t* pix3, const pel10_t* pix4, intptr_t i_fref_stride, int32_t* res)\
+{\
+    int32_t sad0 = 0, sad1 = 0, sad2 = 0;\
+    int row, col;\
+    for (row = 0; row < h; row++) {\
+        for (col = 0; col < w; col++) {\
+            const int cur = pix1[col];\
+            sad0 += abs(cur - pix2[col]);\
+            sad1 += abs(cur - pix3[col]);\
+            sad2 += abs(cur - pix4[col]);\
+        }\
+        pix1 += FENC_STRIDE;\
+        pix2 += i_fref_stride, pix3 += i_fref_stride, pix4 += i_fref_stride;\
+    }\
+    res[0] = sad0, res[1] = sad1, res[2] = sad2;\
+}
+
+PIXEL_SAD10_X3_C(64, 64)  /* 64x64 */
+PIXEL_SAD10_X3_C(64, 32)
+PIXEL_SAD10_X3_C(32, 64)
+PIXEL_SAD10_X3_C(64, 16)
+PIXEL_SAD10_X3_C(64, 48)
+PIXEL_SAD10_X3_C(16, 64)
+PIXEL_SAD10_X3_C(48, 64)
+PIXEL_SAD10_X3_C(32, 32)  /* 32x32 */
+PIXEL_SAD10_X3_C(32, 16)
+PIXEL_SAD10_X3_C(16, 32)
+PIXEL_SAD10_X3_C(32,  8)
+PIXEL_SAD10_X3_C(32, 24)
+PIXEL_SAD10_X3_C( 8, 32)
+PIXEL_SAD10_X3_C(24, 32)
+PIXEL_SAD10_X3_C(16, 16)  /* 16x16 */
+PIXEL_SAD10_X3_C(16,  8)
+PIXEL_SAD10_X3_C( 8, 16)
+PIXEL_SAD10_X3_C(16,  4)
+PIXEL_SAD10_X3_C(16, 12)
+PIXEL_SAD10_X3_C( 4, 16)
+PIXEL_SAD10_X3_C(12, 16)
+PIXEL_SAD10_X3_C( 8,  8)  /* 8x8 */
+PIXEL_SAD10_X3_C( 8,  4)
+PIXEL_SAD10_X3_C( 4,  8)
+PIXEL_SAD10_X3_C( 4,  4)  /* 4x4 */
 
 
 /**
@@ -193,8 +284,57 @@ PIXEL_SAD_X3_C( 4,  4)  /* 4x4 */
  * ---------------------------------------------------------------------------
  */
 
-#define PIXEL_SAD_X4_C(w, h) \
-void xavs2_pixel_sad_x4_##w##x##h(const pel_t* pix1, const pel_t* pix2, const pel_t* pix3, const pel_t* pix4, const pel_t* pix5, intptr_t i_fref_stride, int32_t* res)\
+/* Emits xavs2_pixel_sad8_x4_WxH(): SADs of one w x h pel8_t block (pix1, stride FENC_STRIDE) against */
+/* four candidate blocks (pix2..pix5, common stride i_fref_stride); results are stored in res[0..3].  */
+#define PIXEL_SAD8_X4_C(w, h) \
+void xavs2_pixel_sad8_x4_##w##x##h(const pel8_t* pix1, const pel8_t* pix2, const pel8_t* pix3, const pel8_t* pix4, const pel8_t* pix5, intptr_t i_fref_stride, int32_t* res)\
+{\
+    int32_t sad0 = 0, sad1 = 0, sad2 = 0, sad3 = 0;\
+    int row, col;\
+    for (row = 0; row < h; row++) {\
+        for (col = 0; col < w; col++) {\
+            const int cur = pix1[col];\
+            sad0 += abs(cur - pix2[col]);\
+            sad1 += abs(cur - pix3[col]);\
+            sad2 += abs(cur - pix4[col]);\
+            sad3 += abs(cur - pix5[col]);\
+        }\
+        pix1 += FENC_STRIDE;\
+        pix2 += i_fref_stride, pix3 += i_fref_stride;\
+        pix4 += i_fref_stride, pix5 += i_fref_stride;\
+    }\
+    res[0] = sad0, res[1] = sad1;\
+    res[2] = sad2, res[3] = sad3;\
+}
+
+PIXEL_SAD8_X4_C(64, 64)  /* 64x64 */
+PIXEL_SAD8_X4_C(64, 32)
+PIXEL_SAD8_X4_C(32, 64)
+PIXEL_SAD8_X4_C(64, 16)
+PIXEL_SAD8_X4_C(64, 48)
+PIXEL_SAD8_X4_C(16, 64)
+PIXEL_SAD8_X4_C(48, 64)
+PIXEL_SAD8_X4_C(32, 32)  /* 32x32 */
+PIXEL_SAD8_X4_C(32, 16)
+PIXEL_SAD8_X4_C(16, 32)
+PIXEL_SAD8_X4_C(32,  8)
+PIXEL_SAD8_X4_C(32, 24)
+PIXEL_SAD8_X4_C( 8, 32)
+PIXEL_SAD8_X4_C(24, 32)
+PIXEL_SAD8_X4_C(16, 16)  /* 16x16 */
+PIXEL_SAD8_X4_C(16,  8)
+PIXEL_SAD8_X4_C( 8, 16)
+PIXEL_SAD8_X4_C(16,  4)
+PIXEL_SAD8_X4_C(16, 12)
+PIXEL_SAD8_X4_C( 4, 16)
+PIXEL_SAD8_X4_C(12, 16)
+PIXEL_SAD8_X4_C( 8,  8)  /* 8x8 */
+PIXEL_SAD8_X4_C( 8,  4)
+PIXEL_SAD8_X4_C( 4,  8)
+PIXEL_SAD8_X4_C( 4,  4)  /* 4x4 */
+
+#define PIXEL_SAD10_X4_C(w, h) \
+void xavs2_pixel_sad10_x4_##w##x##h(const pel10_t* pix1, const pel10_t* pix2, const pel10_t* pix3, const pel10_t* pix4, const pel10_t* pix5, intptr_t i_fref_stride, int32_t* res)\
 {\
     int x, y;\
     res[0] = 0;\
@@ -216,31 +356,32 @@ void xavs2_pixel_sad_x4_##w##x##h(const pel_t* pix1, const pel_t* pix2, const pe
     }\
 }
 
-PIXEL_SAD_X4_C(64, 64)  /* 64x64 */
-PIXEL_SAD_X4_C(64, 32)
-PIXEL_SAD_X4_C(32, 64)
-PIXEL_SAD_X4_C(64, 16)
-PIXEL_SAD_X4_C(64, 48)
-PIXEL_SAD_X4_C(16, 64)
-PIXEL_SAD_X4_C(48, 64)
-PIXEL_SAD_X4_C(32, 32)  /* 32x32 */
-PIXEL_SAD_X4_C(32, 16)
-PIXEL_SAD_X4_C(16, 32)
-PIXEL_SAD_X4_C(32,  8)
-PIXEL_SAD_X4_C(32, 24)
-PIXEL_SAD_X4_C( 8, 32)
-PIXEL_SAD_X4_C(24, 32)
-PIXEL_SAD_X4_C(16, 16)  /* 16x16 */
-PIXEL_SAD_X4_C(16,  8)
-PIXEL_SAD_X4_C( 8, 16)
-PIXEL_SAD_X4_C(16,  4)
-PIXEL_SAD_X4_C(16, 12)
-PIXEL_SAD_X4_C( 4, 16)
-PIXEL_SAD_X4_C(12, 16)
-PIXEL_SAD_X4_C( 8,  8)  /* 8x8 */
-PIXEL_SAD_X4_C( 8,  4)
-PIXEL_SAD_X4_C( 4,  8)
-PIXEL_SAD_X4_C( 4,  4)  /* 4x4 */
+PIXEL_SAD10_X4_C(64, 64)  /* 64x64 */
+PIXEL_SAD10_X4_C(64, 32)
+PIXEL_SAD10_X4_C(32, 64)
+PIXEL_SAD10_X4_C(64, 16)
+PIXEL_SAD10_X4_C(64, 48)
+PIXEL_SAD10_X4_C(16, 64)
+PIXEL_SAD10_X4_C(48, 64)
+PIXEL_SAD10_X4_C(32, 32)  /* 32x32 */
+PIXEL_SAD10_X4_C(32, 16)
+PIXEL_SAD10_X4_C(16, 32)
+PIXEL_SAD10_X4_C(32,  8)
+PIXEL_SAD10_X4_C(32, 24)
+PIXEL_SAD10_X4_C( 8, 32)
+PIXEL_SAD10_X4_C(24, 32)
+PIXEL_SAD10_X4_C(16, 16)  /* 16x16 */
+PIXEL_SAD10_X4_C(16,  8)
+PIXEL_SAD10_X4_C( 8, 16)
+PIXEL_SAD10_X4_C(16,  4)
+PIXEL_SAD10_X4_C(16, 12)
+PIXEL_SAD10_X4_C( 4, 16)
+PIXEL_SAD10_X4_C(12, 16)
+PIXEL_SAD10_X4_C( 8,  8)  /* 8x8 */
+PIXEL_SAD10_X4_C( 8,  4)
+PIXEL_SAD10_X4_C( 4,  8)
+PIXEL_SAD10_X4_C( 4,  4)  /* 4x4 */
+//#endif
 
 
 /**
@@ -294,7 +435,34 @@ ALWAYS_INLINE uint64_t abs2_10bit(uint64_t a)
 
 /* ---------------------------------------------------------------------------
  */
-static cmp_dist_t xavs2_pixel_satd_4x4(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)
+static cmp_dist_t xavs2_pixel_satd8_4x4(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)
+{   /* SATD-style cost of the 4x4 block difference pix1 - pix2; i_pix1 / i_pix2 are the row strides in samples */
+    uint32_t tmp[4][2];                 /* per row: two words, each carrying two packed partial results */
+    uint32_t a0, a1, a2, a3, b0, b1;
+    cmp_dist_t sum = 0;
+    int i;
+    /* pass 1: butterfly the four differences of each row; two results share a word (low field, and field at BITS_PER_SUM) */
+    for (i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) {
+        a0 = pix1[0] - pix2[0];         /* negative differences wrap in the unsigned word */
+        a1 = pix1[1] - pix2[1];
+        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);   /* sum in the low field, difference in the high field */
+        a2 = pix1[2] - pix2[2];
+        a3 = pix1[3] - pix2[3];
+        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
+        tmp[i][0] = b0 + b1;
+        tmp[i][1] = b0 - b1;
+    }
+    /* pass 2: HADAMARD4 (defined elsewhere) over the four rows of each packed column, then accumulate magnitudes */
+    for (i = 0; i < 2; i++) {
+        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);   /* abs2 is presumably a per-field absolute value - confirm */
+        sum += ((uint16_t)a0) + (a0 >> BITS_PER_SUM);     /* NOTE(review): the cast isolates the low field only if BITS_PER_SUM == 16 - confirm */
+    }
+    /* the folded total is halved before being returned */
+    return (sum >> 1);
+}
+
+static cmp_dist_t xavs2_pixel_satd10_4x4(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)
 {
     uint32_t tmp[4][2];
     uint32_t a0, a1, a2, a3, b0, b1;
@@ -324,7 +492,30 @@ static cmp_dist_t xavs2_pixel_satd_4x4(const pel_t *pix1, intptr_t i_pix1, const
 /* ---------------------------------------------------------------------------
  * SWAR version of satd 8x4, performs two 4x4 SATDs at once
  */
-static cmp_dist_t xavs2_pixel_satd_8x4(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)
+static cmp_dist_t xavs2_pixel_satd8_8x4(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)
+{   /* SATD-style cost of an 8x4 block: columns 0-3 and 4-7 travel together in the two fields of each 32-bit word */
+    uint32_t tmp[4][4];
+    uint32_t a0, a1, a2, a3;
+    cmp_dist_t sum = 0;
+    int i;
+    /* pass 1: per row, pack the difference at column k (low field) with the one at column k + 4 (shifted by BITS_PER_SUM) */
+    for (i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) {
+        a0 = (pix1[0] - pix2[0]) + ((uint32_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
+        a1 = (pix1[1] - pix2[1]) + ((uint32_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
+        a2 = (pix1[2] - pix2[2]) + ((uint32_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
+        a3 = (pix1[3] - pix2[3]) + ((uint32_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
+        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);   /* row transform; HADAMARD4 is defined elsewhere */
+    }
+    /* pass 2: HADAMARD4 down each column of tmp, accumulating abs2() of the results for both fields at once */
+    for (i = 0; i < 4; i++) {
+        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
+    }
+    /* fold both fields and halve; NOTE(review): the uint16_t cast matches the low field only if BITS_PER_SUM == 16 - confirm */
+    return (((uint16_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
+}
+
+static cmp_dist_t xavs2_pixel_satd10_8x4(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)
 {
     uint32_t tmp[4][4];
     uint32_t a0, a1, a2, a3;
@@ -351,14 +542,28 @@ static cmp_dist_t xavs2_pixel_satd_8x4(const pel_t *pix1, intptr_t i_pix1, const
 /* ---------------------------------------------------------------------------
  * calculate satd in blocks of 4x4
  */
-#define PIXEL_SATD4_C(w, h) \
-static cmp_dist_t xavs2_pixel_satd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\
+#define PIXEL_SATD8_4_C(w, h) /* emits xavs2_pixel_satd8_WxH(): adds xavs2_pixel_satd8_4x4 over all 4x4 tiles of a w x h block */ \
+static cmp_dist_t xavs2_pixel_satd8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\
+{\
+    cmp_dist_t total = 0;\
+    int row, col;\
+    for (row = 0; row < h; row += 4) {\
+        const pel8_t *p1 = pix1 + row * i_pix1, *p2 = pix2 + row * i_pix2;\
+        for (col = 0; col < w; col += 4) {\
+            total += xavs2_pixel_satd8_4x4(p1 + col, i_pix1, p2 + col, i_pix2);\
+        }\
+    }\
+    return total;\
+}
+
+#define PIXEL_SATD10_4_C(w, h) \
+static cmp_dist_t xavs2_pixel_satd10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\
 {\
     cmp_dist_t satd = 0;\
     int y, x;\
     for (y = 0; y < h; y += 4) {\
         for (x = 0; x < w; x += 4) {\
-            satd += xavs2_pixel_satd_4x4(pix1 + y * i_pix1 + x, i_pix1,\
+            satd += xavs2_pixel_satd10_4x4(pix1 + y * i_pix1 + x, i_pix1,\
                                        pix2 + y * i_pix2 + x, i_pix2);\
         }\
     }\
@@ -368,43 +573,81 @@ static cmp_dist_t xavs2_pixel_satd_##w##x##h(const pel_t *pix1, intptr_t i_pix1,
 /* ---------------------------------------------------------------------------
  * calculate satd in blocks of 8x4
  */
-#define PIXEL_SATD8_C(w, h) \
-static cmp_dist_t xavs2_pixel_satd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\
+#define PIXEL_SATD8_8_C(w, h) \
+static cmp_dist_t xavs2_pixel_satd8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\
 {\
     cmp_dist_t satd = 0;\
     int y, x;\
     for (y = 0; y < h; y += 4) {\
         for (x = 0; x < w; x += 8) {\
-            satd += xavs2_pixel_satd_8x4(pix1 + y * i_pix1 + x, i_pix1,\
+            satd += xavs2_pixel_satd8_8x4(pix1 + y * i_pix1 + x, i_pix1,\
                                        pix2 + y * i_pix2 + x, i_pix2);\
         }\
     }\
     return satd;\
 }
 
-PIXEL_SATD8_C(64, 64) /* 64x64 */
-PIXEL_SATD8_C(64, 32)
-PIXEL_SATD8_C(32, 64)
-PIXEL_SATD8_C(64, 16)
-PIXEL_SATD8_C(64, 48)
-PIXEL_SATD8_C(16, 64)
-PIXEL_SATD8_C(48, 64)
-PIXEL_SATD8_C(32, 32) /* 32x32 */
-PIXEL_SATD8_C(32, 16)
-PIXEL_SATD8_C(16, 32)
-PIXEL_SATD8_C(32,  8)
-PIXEL_SATD8_C(32, 24)
-PIXEL_SATD8_C( 8, 32)
-PIXEL_SATD8_C(24, 32)
-PIXEL_SATD8_C(16, 16) /* 16x16 */
-PIXEL_SATD8_C(16,  8)
-PIXEL_SATD8_C( 8, 16)
-PIXEL_SATD8_C(16,  4)
-PIXEL_SATD8_C(16, 12)
-PIXEL_SATD4_C( 4, 16)
-PIXEL_SATD4_C(12, 16)
-PIXEL_SATD8_C( 8,  8) /* 8x8 */
-PIXEL_SATD4_C( 4,  8)
+PIXEL_SATD8_8_C(64, 64) /* 64x64 */
+PIXEL_SATD8_8_C(64, 32)
+PIXEL_SATD8_8_C(32, 64)
+PIXEL_SATD8_8_C(64, 16)
+PIXEL_SATD8_8_C(64, 48)
+PIXEL_SATD8_8_C(16, 64)
+PIXEL_SATD8_8_C(48, 64)
+PIXEL_SATD8_8_C(32, 32) /* 32x32 */
+PIXEL_SATD8_8_C(32, 16)
+PIXEL_SATD8_8_C(16, 32)
+PIXEL_SATD8_8_C(32,  8)
+PIXEL_SATD8_8_C(32, 24)
+PIXEL_SATD8_8_C( 8, 32)
+PIXEL_SATD8_8_C(24, 32)
+PIXEL_SATD8_8_C(16, 16) /* 16x16 */
+PIXEL_SATD8_8_C(16,  8)
+PIXEL_SATD8_8_C( 8, 16)
+PIXEL_SATD8_8_C(16,  4)
+PIXEL_SATD8_8_C(16, 12)
+PIXEL_SATD8_4_C( 4, 16)
+PIXEL_SATD8_4_C(12, 16)
+PIXEL_SATD8_8_C( 8,  8) /* 8x8 */
+PIXEL_SATD8_4_C( 4,  8)
+
+#define PIXEL_SATD10_8_C(w, h) /* emits xavs2_pixel_satd10_WxH(): adds xavs2_pixel_satd10_8x4 over all 8x4 tiles of a w x h block */ \
+static cmp_dist_t xavs2_pixel_satd10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\
+{\
+    cmp_dist_t total = 0;\
+    int row, col;\
+    for (row = 0; row < h; row += 4) {\
+        const pel10_t *p1 = pix1 + row * i_pix1, *p2 = pix2 + row * i_pix2;\
+        for (col = 0; col < w; col += 8) {\
+            total += xavs2_pixel_satd10_8x4(p1 + col, i_pix1, p2 + col, i_pix2);\
+        }\
+    }\
+    return total;\
+}
+
+PIXEL_SATD10_8_C(64, 64) /* 64x64 */
+PIXEL_SATD10_8_C(64, 32)
+PIXEL_SATD10_8_C(32, 64)
+PIXEL_SATD10_8_C(64, 16)
+PIXEL_SATD10_8_C(64, 48)
+PIXEL_SATD10_8_C(16, 64)
+PIXEL_SATD10_8_C(48, 64)
+PIXEL_SATD10_8_C(32, 32) /* 32x32 */
+PIXEL_SATD10_8_C(32, 16)
+PIXEL_SATD10_8_C(16, 32)
+PIXEL_SATD10_8_C(32,  8)
+PIXEL_SATD10_8_C(32, 24)
+PIXEL_SATD10_8_C( 8, 32)
+PIXEL_SATD10_8_C(24, 32)
+PIXEL_SATD10_8_C(16, 16) /* 16x16 */
+PIXEL_SATD10_8_C(16,  8)
+PIXEL_SATD10_8_C( 8, 16)
+PIXEL_SATD10_8_C(16,  4)
+PIXEL_SATD10_8_C(16, 12)
+PIXEL_SATD10_4_C( 4, 16)
+PIXEL_SATD10_4_C(12, 16)
+PIXEL_SATD10_8_C( 8,  8) /* 8x8 */
+PIXEL_SATD10_4_C( 4,  8)
 
 
 /**
@@ -413,7 +656,42 @@ PIXEL_SATD4_C( 4,  8)
  * ---------------------------------------------------------------------------
  */
 
-int _sa8d_8x8(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_pix2)
+int _sa8d8_8x8(const pel8_t* pix1, intptr_t i_pix1, const pel8_t* pix2, intptr_t i_pix2)
+{   /* raw 8x8 transformed-difference cost of pix1 - pix2 (row strides i_pix1 / i_pix2); the sa8d8 wrappers in this file round it with (v + 2) >> 2 */
+    sum2_t tmp[8][4];                   /* per row: four words, each holding two packed partial results */
+    sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
+    sum2_t sum = 0;
+    /* pass 1: per row, butterfly neighbouring differences into packed words, then HADAMARD4 (defined elsewhere) across the row */
+    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2) {
+        a0 = pix1[0] - pix2[0];
+        a1 = pix1[1] - pix2[1];
+        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);   /* sum in the low field, difference in the field at BITS_PER_SUM */
+        a2 = pix1[2] - pix2[2];
+        a3 = pix1[3] - pix2[3];
+        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
+        a4 = pix1[4] - pix2[4];
+        a5 = pix1[5] - pix2[5];
+        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
+        a6 = pix1[6] - pix2[6];
+        a7 = pix1[7] - pix2[7];
+        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
+        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
+    }
+    /* pass 2: per packed column, HADAMARD4 on rows 0-3 and rows 4-7, then one more butterfly joins the two halves */
+    for (int i = 0; i < 4; i++) {
+        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
+        b0  = abs2(a0 + a4) + abs2(a0 - a4);            /* abs2 is presumably a per-field absolute value - confirm */
+        b0 += abs2(a1 + a5) + abs2(a1 - a5);
+        b0 += abs2(a2 + a6) + abs2(a2 - a6);
+        b0 += abs2(a3 + a7) + abs2(a3 - a7);
+        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);        /* NOTE(review): assumes sum_t is exactly as wide as one packed field - confirm */
+    }
+    /* no rounding or scaling is applied here; the accumulated value is returned as-is */
+    return (cmp_dist_t)sum;
+}
+
+int _sa8d10_8x8(const pel10_t* pix1, intptr_t i_pix1, const pel10_t* pix2, intptr_t i_pix2)
 {
     sum2_t tmp[8][4];
     sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
@@ -451,20 +729,39 @@ int _sa8d_8x8(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_
 /* ---------------------------------------------------------------------------
  */
 static
-cmp_dist_t xavs2_pixel_sa8d_8x8(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_pix2)
+cmp_dist_t xavs2_pixel_sa8d8_8x8(const pel8_t* pix1, intptr_t i_pix1, const pel8_t* pix2, intptr_t i_pix2)
 {
-    return (cmp_dist_t)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
+    return (cmp_dist_t)((_sa8d8_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
+}
+
+static
+cmp_dist_t xavs2_pixel_sa8d10_8x8(const pel10_t* pix1, intptr_t i_pix1, const pel10_t* pix2, intptr_t i_pix2)
+{
+    return (cmp_dist_t)((_sa8d10_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
 }
 
 /* ---------------------------------------------------------------------------
  */
 static
-cmp_dist_t xavs2_pixel_sa8d_16x16(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_pix2)
+cmp_dist_t xavs2_pixel_sa8d8_16x16(const pel8_t* pix1, intptr_t i_pix1, const pel8_t* pix2, intptr_t i_pix2)
+{
+    cmp_dist_t sum = _sa8d8_8x8(pix1, i_pix1, pix2, i_pix2)
+                     + _sa8d8_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
+                     + _sa8d8_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
+                     + _sa8d8_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);
+
+    // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because
+    // this version only rounds once at the end
+    return (sum + 2) >> 2;
+}
+
+static
+cmp_dist_t xavs2_pixel_sa8d10_16x16(const pel10_t* pix1, intptr_t i_pix1, const pel10_t* pix2, intptr_t i_pix2)
 {
-    cmp_dist_t sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
-                     + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
-                     + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
-                     + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);
+    cmp_dist_t sum = _sa8d10_8x8(pix1, i_pix1, pix2, i_pix2)
+                     + _sa8d10_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
+                     + _sa8d10_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
+                     + _sa8d10_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);
 
     // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because
     // this version only rounds once at the end
@@ -474,14 +771,28 @@ cmp_dist_t xavs2_pixel_sa8d_16x16(const pel_t* pix1, intptr_t i_pix1, const pel_
 /* ---------------------------------------------------------------------------
  * calculate sa8d in blocks of 8x8
  */
-#define PIXEL_SA8D_C8(w, h) \
-static cmp_dist_t xavs2_pixel_sa8d_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\
+#define PIXEL_SA8D8_C8(w, h) \
+static cmp_dist_t xavs2_pixel_sa8d8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\
 {\
     cmp_dist_t sa8d = 0;\
     int y, x;\
     for (y = 0; y < h; y += 8) {\
         for (x = 0; x < w; x += 8) {\
-            sa8d += xavs2_pixel_sa8d_8x8(pix1 + y * i_pix1 + x, i_pix1,\
+            sa8d += xavs2_pixel_sa8d8_8x8(pix1 + y * i_pix1 + x, i_pix1,\
+                                         pix2 + y * i_pix2 + x, i_pix2);\
+        }\
+    }\
+    return sa8d;\
+}
+
+#define PIXEL_SA8D10_C8(w, h) \
+static cmp_dist_t xavs2_pixel_sa8d10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\
+{\
+    cmp_dist_t sa8d = 0;\
+    int y, x;\
+    for (y = 0; y < h; y += 8) {\
+        for (x = 0; x < w; x += 8) {\
+            sa8d += xavs2_pixel_sa8d10_8x8(pix1 + y * i_pix1 + x, i_pix1,\
                                          pix2 + y * i_pix2 + x, i_pix2);\
         }\
     }\
@@ -491,50 +802,104 @@ static cmp_dist_t xavs2_pixel_sa8d_##w##x##h(const pel_t *pix1, intptr_t i_pix1,
 /* ---------------------------------------------------------------------------
  * calculate sa8d in blocks of 16x16
  */
-#define PIXEL_SA8D_C16(w, h) \
-static cmp_dist_t xavs2_pixel_sa8d_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\
+#define PIXEL_SA8D8_C16(w, h) \
+static cmp_dist_t xavs2_pixel_sa8d8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\
 {\
     cmp_dist_t sa8d = 0;\
     int y, x;\
     for (y = 0; y < h; y += 16) {\
         for (x = 0; x < w; x += 16) {\
-            sa8d += xavs2_pixel_sa8d_16x16(pix1 + y * i_pix1 + x, i_pix1,\
+            sa8d += xavs2_pixel_sa8d8_16x16(pix1 + y * i_pix1 + x, i_pix1,\
                                            pix2 + y * i_pix2 + x, i_pix2);\
         }\
     }\
     return sa8d;\
 }
 
-#define xavs2_pixel_sa8d_4x4    xavs2_pixel_satd_4x4
-#define xavs2_pixel_sa8d_4x8    xavs2_pixel_satd_4x8
-#define xavs2_pixel_sa8d_8x4    xavs2_pixel_satd_8x4
-#define xavs2_pixel_sa8d_16x4   xavs2_pixel_satd_16x4
-#define xavs2_pixel_sa8d_4x16   xavs2_pixel_satd_4x16
-#define xavs2_pixel_sa8d_12x16  xavs2_pixel_satd_12x16
-#define xavs2_pixel_sa8d_16x12  xavs2_pixel_satd_16x12
-PIXEL_SA8D_C8(8, 16)
-PIXEL_SA8D_C8(8, 32)
-PIXEL_SA8D_C8(16, 8)
-PIXEL_SA8D_C8(32, 8)
-PIXEL_SA8D_C16(32, 16)
-PIXEL_SA8D_C8(32, 24)
-PIXEL_SA8D_C8(24, 32)
-PIXEL_SA8D_C16(32, 32)
-PIXEL_SA8D_C16(16, 32)
-PIXEL_SA8D_C16(64, 16)
-PIXEL_SA8D_C16(64, 32)
-PIXEL_SA8D_C16(64, 48)
-PIXEL_SA8D_C16(16, 64)
-PIXEL_SA8D_C16(32, 64)
-PIXEL_SA8D_C16(48, 64)
-PIXEL_SA8D_C16(64, 64)
+#define xavs2_pixel_sa8d8_4x4    xavs2_pixel_satd8_4x4
+#define xavs2_pixel_sa8d8_4x8    xavs2_pixel_satd8_4x8
+#define xavs2_pixel_sa8d8_8x4    xavs2_pixel_satd8_8x4
+#define xavs2_pixel_sa8d8_16x4   xavs2_pixel_satd8_16x4
+#define xavs2_pixel_sa8d8_4x16   xavs2_pixel_satd8_4x16
+#define xavs2_pixel_sa8d8_12x16  xavs2_pixel_satd8_12x16
+#define xavs2_pixel_sa8d8_16x12  xavs2_pixel_satd8_16x12
+PIXEL_SA8D8_C8(8, 16)
+PIXEL_SA8D8_C8(8, 32)
+PIXEL_SA8D8_C8(16, 8)
+PIXEL_SA8D8_C8(32, 8)
+PIXEL_SA8D8_C16(32, 16)
+PIXEL_SA8D8_C8(32, 24)
+PIXEL_SA8D8_C8(24, 32)
+PIXEL_SA8D8_C16(32, 32)
+PIXEL_SA8D8_C16(16, 32)
+PIXEL_SA8D8_C16(64, 16)
+PIXEL_SA8D8_C16(64, 32)
+PIXEL_SA8D8_C16(64, 48)
+PIXEL_SA8D8_C16(16, 64)
+PIXEL_SA8D8_C16(32, 64)
+PIXEL_SA8D8_C16(48, 64)
+PIXEL_SA8D8_C16(64, 64)
+
+#define PIXEL_SA8D10_C16(w, h) \
+static cmp_dist_t xavs2_pixel_sa8d10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\
+{\
+    cmp_dist_t sa8d = 0;\
+    int y, x;\
+    for (y = 0; y < h; y += 16) {\
+        for (x = 0; x < w; x += 16) {\
+            sa8d += xavs2_pixel_sa8d10_16x16(pix1 + y * i_pix1 + x, i_pix1,\
+                                           pix2 + y * i_pix2 + x, i_pix2);\
+        }\
+    }\
+    return sa8d;\
+}
+
+#define xavs2_pixel_sa8d10_4x4    xavs2_pixel_satd10_4x4
+#define xavs2_pixel_sa8d10_4x8    xavs2_pixel_satd10_4x8
+#define xavs2_pixel_sa8d10_8x4    xavs2_pixel_satd10_8x4
+#define xavs2_pixel_sa8d10_16x4   xavs2_pixel_satd10_16x4
+#define xavs2_pixel_sa8d10_4x16   xavs2_pixel_satd10_4x16
+#define xavs2_pixel_sa8d10_12x16  xavs2_pixel_satd10_12x16
+#define xavs2_pixel_sa8d10_16x12  xavs2_pixel_satd10_16x12
+PIXEL_SA8D10_C8(8, 16)
+PIXEL_SA8D10_C8(8, 32)
+PIXEL_SA8D10_C8(16, 8)
+PIXEL_SA8D10_C8(32, 8)
+PIXEL_SA8D10_C16(32, 16)
+PIXEL_SA8D10_C8(32, 24)
+PIXEL_SA8D10_C8(24, 32)
+PIXEL_SA8D10_C16(32, 32)
+PIXEL_SA8D10_C16(16, 32)
+PIXEL_SA8D10_C16(64, 16)
+PIXEL_SA8D10_C16(64, 32)
+PIXEL_SA8D10_C16(64, 48)
+PIXEL_SA8D10_C16(16, 64)
+PIXEL_SA8D10_C16(32, 64)
+PIXEL_SA8D10_C16(48, 64)
+PIXEL_SA8D10_C16(64, 64)
+
 
 /**
  * ---------------------------------------------------------------------------
  * SSD
  * ---------------------------------------------------------------------------
  */
-dist_t xavs2_get_block_ssd_c(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2, int width, int height)
+dist_t xavs2_get_block_ssd8_c(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2, int width, int height)
+{
+    /* sum of squared sample differences over a width x height block; each row step advances by its stride */
+    dist_t ssd = 0;
+    int row, col;
+
+    for (row = 0; row < height; row++, pix1 += i_pix1, pix2 += i_pix2) {
+        for (col = 0; col < width; col++) {
+            const int diff = pix1[col] - pix2[col];
+            ssd += diff * diff;
+        }
+    }
+    return ssd;
+}
+
+dist_t xavs2_get_block_ssd10_c(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2, int width, int height)
 {
     dist_t sum = 0;
     int x, y, tmp;
@@ -549,8 +914,50 @@ dist_t xavs2_get_block_ssd_c(const pel_t *pix1, intptr_t i_pix1, const pel_t *pi
     return sum;
 }
 
-#define PIXEL_SSD_C(w, h) \
-static dist_t xavs2_pixel_ssd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\
+#define PIXEL_SSD8_C(w, h) \
+static dist_t xavs2_pixel_ssd8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\
+{\
+    dist_t sum = 0;\
+    int x, y, tmp;\
+    for (y = 0; y < h; y++) {\
+        for (x = 0; x < w; x++) {\
+            tmp  = pix1[x] - pix2[x];\
+            sum += (tmp * tmp);\
+        }\
+        pix1 += i_pix1;\
+        pix2 += i_pix2;\
+    }\
+    return sum;\
+}
+
+PIXEL_SSD8_C(64, 64)  /* 64x64 */
+PIXEL_SSD8_C(64, 32)
+PIXEL_SSD8_C(32, 64)
+PIXEL_SSD8_C(64, 16)
+PIXEL_SSD8_C(64, 48)
+PIXEL_SSD8_C(16, 64)
+PIXEL_SSD8_C(48, 64)
+PIXEL_SSD8_C(32, 32)  /* 32x32 */
+PIXEL_SSD8_C(32, 16)
+PIXEL_SSD8_C(16, 32)
+PIXEL_SSD8_C(32,  8)
+PIXEL_SSD8_C(32, 24)
+PIXEL_SSD8_C( 8, 32)
+PIXEL_SSD8_C(24, 32)
+PIXEL_SSD8_C(16, 16)  /* 16x16 */
+PIXEL_SSD8_C(16,  8)
+PIXEL_SSD8_C( 8, 16)
+PIXEL_SSD8_C(16,  4)
+PIXEL_SSD8_C(16, 12)
+PIXEL_SSD8_C( 4, 16)
+PIXEL_SSD8_C(12, 16)
+PIXEL_SSD8_C( 8,  8)  /* 8x8 */
+PIXEL_SSD8_C( 8,  4)
+PIXEL_SSD8_C( 4,  8)
+PIXEL_SSD8_C( 4,  4)  /* 4x4 */
+
+#define PIXEL_SSD10_C(w, h) \
+static dist_t xavs2_pixel_ssd10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\
 {\
     dist_t sum = 0;\
     int x, y, tmp;\
@@ -565,46 +972,46 @@ static dist_t xavs2_pixel_ssd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, cons
     return sum;\
 }
 
-PIXEL_SSD_C(64, 64)  /* 64x64 */
-PIXEL_SSD_C(64, 32)
-PIXEL_SSD_C(32, 64)
-PIXEL_SSD_C(64, 16)
-PIXEL_SSD_C(64, 48)
-PIXEL_SSD_C(16, 64)
-PIXEL_SSD_C(48, 64)
-PIXEL_SSD_C(32, 32)  /* 32x32 */
-PIXEL_SSD_C(32, 16)
-PIXEL_SSD_C(16, 32)
-PIXEL_SSD_C(32,  8)
-PIXEL_SSD_C(32, 24)
-PIXEL_SSD_C( 8, 32)
-PIXEL_SSD_C(24, 32)
-PIXEL_SSD_C(16, 16)  /* 16x16 */
-PIXEL_SSD_C(16,  8)
-PIXEL_SSD_C( 8, 16)
-PIXEL_SSD_C(16,  4)
-PIXEL_SSD_C(16, 12)
-PIXEL_SSD_C( 4, 16)
-PIXEL_SSD_C(12, 16)
-PIXEL_SSD_C( 8,  8)  /* 8x8 */
-PIXEL_SSD_C( 8,  4)
-PIXEL_SSD_C( 4,  8)
-PIXEL_SSD_C( 4,  4)  /* 4x4 */
+PIXEL_SSD10_C(64, 64)  /* 64x64 */
+PIXEL_SSD10_C(64, 32)
+PIXEL_SSD10_C(32, 64)
+PIXEL_SSD10_C(64, 16)
+PIXEL_SSD10_C(64, 48)
+PIXEL_SSD10_C(16, 64)
+PIXEL_SSD10_C(48, 64)
+PIXEL_SSD10_C(32, 32)  /* 32x32 */
+PIXEL_SSD10_C(32, 16)
+PIXEL_SSD10_C(16, 32)
+PIXEL_SSD10_C(32,  8)
+PIXEL_SSD10_C(32, 24)
+PIXEL_SSD10_C( 8, 32)
+PIXEL_SSD10_C(24, 32)
+PIXEL_SSD10_C(16, 16)  /* 16x16 */
+PIXEL_SSD10_C(16,  8)
+PIXEL_SSD10_C( 8, 16)
+PIXEL_SSD10_C(16,  4)
+PIXEL_SSD10_C(16, 12)
+PIXEL_SSD10_C( 4, 16)
+PIXEL_SSD10_C(12, 16)
+PIXEL_SSD10_C( 8,  8)  /* 8x8 */
+PIXEL_SSD10_C( 8,  4)
+PIXEL_SSD10_C( 4,  8)
+PIXEL_SSD10_C( 4,  4)  /* 4x4 */
 
 /* ---------------------------------------------------------------------------
  * ssd for one plane of frame
  */
 #if XAVS2_STAT
-uint64_t xavs2_pixel_ssd_wxh(pixel_funcs_t *pf,
-                             pel_t *p_pix1, intptr_t i_pix1,
-                             pel_t *p_pix2, intptr_t i_pix2,
+uint64_t xavs2_pixel_ssd8_wxh(pixel_funcs_t *pf,
+                             pel8_t *p_pix1, intptr_t i_pix1,
+                             pel8_t *p_pix2, intptr_t i_pix2,
                              int i_width, int i_height,
                              int inout_shift)
 {
     uint64_t i_ssd = 0;
     int align = !(((intptr_t)p_pix1 | (intptr_t)p_pix2 | i_pix1 | i_pix2) & 15);
     int x, y;
-    pixel_ssd_t cal_ssd[2];
+    pixel8_ssd_t cal_ssd[2];
 
     if (inout_shift > 0) {
         int inout_offset = 1 << (inout_shift - 1);
@@ -618,8 +1025,8 @@ uint64_t xavs2_pixel_ssd_wxh(pixel_funcs_t *pf,
             p_pix2 += i_pix2;
         }
     } else {
-        cal_ssd[0] = pf->ssd[LUMA_8x8];  /*  8 x  8 */
-        cal_ssd[1] = pf->ssd[LUMA_16x16];  /* 16 x 16 */
+        cal_ssd[0] = pf->ssd8[LUMA_8x8];  /*  8 x  8 */
+        cal_ssd[1] = pf->ssd8[LUMA_16x16];  /* 16 x 16 */
 
 #define SSD(id) i_ssd += cal_ssd[id](p_pix1 + y*i_pix1 + x, i_pix1, p_pix2 + y*i_pix2 + x, i_pix2)
 
@@ -668,17 +1075,132 @@ uint64_t xavs2_pixel_ssd_wxh(pixel_funcs_t *pf,
 
     return i_ssd;
 }
-#endif
 
+/* SSD between two pel10_t planes of i_width x i_height samples; when inout_shift > 0
+ * each sample is rounded and shifted right by inout_shift before differencing. */
+uint64_t xavs2_pixel_ssd10_wxh(pixel_funcs_t *pf,
+                             pel10_t *p_pix1, intptr_t i_pix1,
+                             pel10_t *p_pix2, intptr_t i_pix2,
+                             int i_width, int i_height,
+                             int inout_shift)
+{
+    uint64_t i_ssd = 0;
+    int align = !(((intptr_t)p_pix1 | (intptr_t)p_pix2 | i_pix1 | i_pix2) & 15);
+    int x, y;
+    pixel10_ssd_t cal_ssd[2];
+
+    if (inout_shift > 0) {
+        int inout_offset = 1 << (inout_shift - 1);
 
+        for (y = 0; y < i_height; y++) {
+            for (x = 0; x < i_width; x++) {
+                int d = ((p_pix1[x] + inout_offset) >> inout_shift) - ((p_pix2[x] + inout_offset) >> inout_shift);
+                i_ssd += d * d;
+            }
+            p_pix1 += i_pix1;
+            p_pix2 += i_pix2;
+        }
+    } else {
+        cal_ssd[0] = pf->ssd10[LUMA_8x8];  /*  8 x  8 */
+        cal_ssd[1] = pf->ssd10[LUMA_16x16];  /* 16 x 16 */
+
+#define SSD(id, row) i_ssd += cal_ssd[id](p_pix1 + (row)*i_pix1 + x, i_pix1, p_pix2 + (row)*i_pix2 + x, i_pix2)
+
+        for (y = 0; y < i_height - 15; y += 16) {
+            x = 0;
+            if (align) {
+                /* 16x16 kernel is used only when pointers and strides have their low 4 bits clear */
+                for (; x < i_width - 15; x += 16) {
+                    SSD(1, y);          /* 16x16 */
+                }
+            }
+            /* columns not covered above (all of them when !align, or a trailing
+             * 8-wide strip when align is set): two stacked 8x8 blocks per column */
+            for (; x < i_width - 7; x += 8) {
+                SSD(0, y);              /* upper 8x8 */
+                SSD(0, y + 8);          /* lower 8x8 */
+            }
+        }
+        if (y < i_height - 7) {
+            for (x = 0; x < i_width - 7; x += 8) {
+                SSD(0, y);          /* 8x8 */
+            }
+        }
+#undef SSD
+
+        /* sum the rest ssd */
+#define SSD1    { int d = p_pix1[y*i_pix1+x] - p_pix2[y*i_pix2+x]; i_ssd += d*d; }
+        if (i_width & 7) {
+            for (y = 0; y < (i_height & ~7); y++) {
+                for (x = i_width & ~7; x < i_width; x++) {
+                    SSD1;
+                }
+            }
+        }
+        if (i_height & 7) {
+            for (y = i_height & ~7; y < i_height; y++) {
+                for (x = 0; x < i_width; x++) {
+                    SSD1;
+                }
+            }
+        }
+#undef SSD1
+    }
+
+    return i_ssd;
+}
+#endif
+
+//#if !HIGH_BIT_DEPTH
 /**
  * ---------------------------------------------------------------------------
  * AVG
  * ---------------------------------------------------------------------------
  */
 
-#define PIXEL_AVG_C(w, h) \
-static void xavs2_pixel_avg_##w##x##h(pel_t* dst, intptr_t dstride, const pel_t* src0, intptr_t sstride0, const pel_t* src1, intptr_t sstride1, int weight)\
+#define PIXEL_AVG8_C(w, h) \
+static void xavs2_pixel_avg8_##w##x##h(pel8_t* dst, intptr_t dstride, const pel8_t* src0, intptr_t sstride0, const pel8_t* src1, intptr_t sstride1, int weight)\
+{\
+    int x, y;\
+    UNUSED_PARAMETER(weight); \
+    for (y = 0; y < h; y++) {\
+        for (x = 0; x < w; x++) {\
+            dst[x] = (src0[x] + src1[x] + 1) >> 1;\
+        }\
+        dst  += dstride;\
+        src0 += sstride0;\
+        src1 += sstride1;\
+    }\
+}
+
+PIXEL_AVG8_C(64, 64) /* 64x64 */
+PIXEL_AVG8_C(64, 32)
+PIXEL_AVG8_C(32, 64)
+PIXEL_AVG8_C(64, 16)
+PIXEL_AVG8_C(64, 48)
+PIXEL_AVG8_C(16, 64)
+PIXEL_AVG8_C(48, 64)
+PIXEL_AVG8_C(32, 32) /* 32x32 */
+PIXEL_AVG8_C(32, 16)
+PIXEL_AVG8_C(16, 32)
+PIXEL_AVG8_C(32,  8)
+PIXEL_AVG8_C(32, 24)
+PIXEL_AVG8_C( 8, 32)
+PIXEL_AVG8_C(24, 32)
+PIXEL_AVG8_C(16, 16) /* 16x16 */
+PIXEL_AVG8_C(16,  8)
+PIXEL_AVG8_C( 8, 16)
+PIXEL_AVG8_C(16,  4)
+PIXEL_AVG8_C(16, 12)
+PIXEL_AVG8_C( 4, 16)
+PIXEL_AVG8_C(12, 16)
+PIXEL_AVG8_C( 8,  8) /* 8x8 */
+PIXEL_AVG8_C( 8,  4)
+PIXEL_AVG8_C( 4,  8)
+PIXEL_AVG8_C( 4,  4) /* 4x4 */
+
+#define PIXEL_AVG10_C(w, h) \
+static void xavs2_pixel_avg10_##w##x##h(pel10_t* dst, intptr_t dstride, const pel10_t* src0, intptr_t sstride0, const pel10_t* src1, intptr_t sstride1, int weight)\
 {\
     int x, y;\
     UNUSED_PARAMETER(weight); \
@@ -692,31 +1214,32 @@ static void xavs2_pixel_avg_##w##x##h(pel_t* dst, intptr_t dstride, const pel_t*
     }\
 }
 
-PIXEL_AVG_C(64, 64) /* 64x64 */
-PIXEL_AVG_C(64, 32)
-PIXEL_AVG_C(32, 64)
-PIXEL_AVG_C(64, 16)
-PIXEL_AVG_C(64, 48)
-PIXEL_AVG_C(16, 64)
-PIXEL_AVG_C(48, 64)
-PIXEL_AVG_C(32, 32) /* 32x32 */
-PIXEL_AVG_C(32, 16)
-PIXEL_AVG_C(16, 32)
-PIXEL_AVG_C(32,  8)
-PIXEL_AVG_C(32, 24)
-PIXEL_AVG_C( 8, 32)
-PIXEL_AVG_C(24, 32)
-PIXEL_AVG_C(16, 16) /* 16x16 */
-PIXEL_AVG_C(16,  8)
-PIXEL_AVG_C( 8, 16)
-PIXEL_AVG_C(16,  4)
-PIXEL_AVG_C(16, 12)
-PIXEL_AVG_C( 4, 16)
-PIXEL_AVG_C(12, 16)
-PIXEL_AVG_C( 8,  8) /* 8x8 */
-PIXEL_AVG_C( 8,  4)
-PIXEL_AVG_C( 4,  8)
-PIXEL_AVG_C( 4,  4) /* 4x4 */
+PIXEL_AVG10_C(64, 64) /* 64x64 */
+PIXEL_AVG10_C(64, 32)
+PIXEL_AVG10_C(32, 64)
+PIXEL_AVG10_C(64, 16)
+PIXEL_AVG10_C(64, 48)
+PIXEL_AVG10_C(16, 64)
+PIXEL_AVG10_C(48, 64)
+PIXEL_AVG10_C(32, 32) /* 32x32 */
+PIXEL_AVG10_C(32, 16)
+PIXEL_AVG10_C(16, 32)
+PIXEL_AVG10_C(32,  8)
+PIXEL_AVG10_C(32, 24)
+PIXEL_AVG10_C( 8, 32)
+PIXEL_AVG10_C(24, 32)
+PIXEL_AVG10_C(16, 16) /* 16x16 */
+PIXEL_AVG10_C(16,  8)
+PIXEL_AVG10_C( 8, 16)
+PIXEL_AVG10_C(16,  4)
+PIXEL_AVG10_C(16, 12)
+PIXEL_AVG10_C( 4, 16)
+PIXEL_AVG10_C(12, 16)
+PIXEL_AVG10_C( 8,  8) /* 8x8 */
+PIXEL_AVG10_C( 8,  4)
+PIXEL_AVG10_C( 4,  8)
+PIXEL_AVG10_C( 4,  4) /* 4x4 */
+//#endif
 
 
 /**
@@ -724,8 +1247,8 @@ PIXEL_AVG_C( 4,  4) /* 4x4 */
  * block operation: copy/add/sub (p: pixel, s: short)
  * ---------------------------------------------------------------------------
  */
-#define BLOCKCOPY_PP_C(w, h) \
-static void xavs2_blockcopy_pp_##w##x##h(pel_t *a, intptr_t stridea, const pel_t *b, intptr_t strideb)\
+#define BLOCKCOPY_PP8_C(w, h) \
+static void xavs2_blockcopy_pp8_##w##x##h(pel8_t *a, intptr_t stridea, const pel8_t *b, intptr_t strideb)\
 {\
     int x, y;\
     for (y = 0; y < h; y++) {\
@@ -737,8 +1260,8 @@ static void xavs2_blockcopy_pp_##w##x##h(pel_t *a, intptr_t stridea, const pel_t
     }\
 }
 
-#define BLOCKCOPY_SS_C(w, h) \
-static void xavs2_blockcopy_ss_##w##x##h(coeff_t* a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\
+#define BLOCKCOPY_PP10_C(w, h) \
+static void xavs2_blockcopy_pp10_##w##x##h(pel10_t *a, intptr_t stridea, const pel10_t *b, intptr_t strideb)\
 {\
     int x, y;\
     for (y = 0; y < h; y++) {\
@@ -750,22 +1273,62 @@ static void xavs2_blockcopy_ss_##w##x##h(coeff_t* a, intptr_t stridea, const coe
     }\
 }
 
-#define BLOCKCOPY_SP_C(w, h) \
-static void xavs2_blockcopy_sp_##w##x##h(pel_t *a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\
+#define BLOCKCOPY_SS8_C(w, h) \
+static void xavs2_blockcopy_ss8_##w##x##h(coeff_t* a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\
+{\
+    int x, y;\
+    for (y = 0; y < h; y++) {\
+        for (x = 0; x < w; x++) {\
+            a[x] = b[x];\
+        }\
+        a += stridea;\
+        b += strideb;\
+    }\
+}
+
+#define BLOCKCOPY_SS10_C(w, h) \
+static void xavs2_blockcopy_ss10_##w##x##h(coeff_t* a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\
+{\
+    int x, y;\
+    for (y = 0; y < h; y++) {\
+        for (x = 0; x < w; x++) {\
+            a[x] = b[x];\
+        }\
+        a += stridea;\
+        b += strideb;\
+    }\
+}
+
+#define BLOCKCOPY_SP8_C(w, h) \
+static void xavs2_blockcopy_sp8_##w##x##h(pel8_t *a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\
+{\
+    int x, y;\
+    for (y = 0; y < h; y++) {\
+        for (x = 0; x < w; x++) {\
+            assert((b[x] >= 0) && (b[x] <= ((1 << 8) - 1)));\
+            a[x] = (pel8_t)b[x];\
+        }\
+        a += stridea;\
+        b += strideb;\
+    }\
+}
+
+#define BLOCKCOPY_SP10_C(w, h) \
+static void xavs2_blockcopy_sp10_##w##x##h(pel10_t *a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\
 {\
     int x, y;\
     for (y = 0; y < h; y++) {\
         for (x = 0; x < w; x++) {\
-            assert((b[x] >= 0) && (b[x] <= ((1 << 8) - 1)));\
-            a[x] = (pel_t)b[x];\
+            assert((b[x] >= 0) && (b[x] <= ((1 << 10) - 1)));\
+            a[x] = (pel10_t)b[x];\
         }\
         a += stridea;\
         b += strideb;\
     }\
 }
 
-#define BLOCKCOPY_PS_C(w, h) \
-static void xavs2_blockcopy_ps_##w##x##h(coeff_t *a, intptr_t stridea, const pel_t *b, intptr_t strideb)\
+#define BLOCKCOPY_PS8_C(w, h) \
+static void xavs2_blockcopy_ps8_##w##x##h(coeff_t *a, intptr_t stridea, const pel8_t *b, intptr_t strideb)\
 {\
     int x, y;\
     for (y = 0; y < h; y++) {\
@@ -775,10 +1338,37 @@ static void xavs2_blockcopy_ps_##w##x##h(coeff_t *a, intptr_t stridea, const pel
         a += stridea;\
         b += strideb;\
     }\
-}\
- 
-#define PIXEL_SUB_PS_C(w, h) \
-static void xavs2_pixel_sub_ps_##w##x##h(coeff_t *a, intptr_t dstride, const pel_t *b0, const pel_t *b1, intptr_t sstride0, intptr_t sstride1)\
+}
+
+#define BLOCKCOPY_PS10_C(w, h) \
+static void xavs2_blockcopy_ps10_##w##x##h(coeff_t *a, intptr_t stridea, const pel10_t *b, intptr_t strideb)\
+{\
+    int x, y;\
+    for (y = 0; y < h; y++) {\
+        for (x = 0; x < w; x++) {\
+            a[x] = (int16_t)b[x];\
+        }\
+        a += stridea;\
+        b += strideb;\
+    }\
+}
+
+#define PIXEL_SUB_PS8_C(w, h) \
+static void xavs2_pixel_sub_ps8_##w##x##h(coeff_t *a, intptr_t dstride, const pel8_t *b0, const pel8_t *b1, intptr_t sstride0, intptr_t sstride1)\
+{\
+    int x, y;\
+    for (y = 0; y < h; y++) {\
+        for (x = 0; x < w; x++) {\
+            a[x] = (int16_t)(b0[x] - b1[x]);\
+        }\
+        b0 += sstride0;\
+        b1 += sstride1;\
+        a  += dstride;\
+    }\
+}
+
+#define PIXEL_SUB_PS10_C(w, h) \
+static void xavs2_pixel_sub_ps10_##w##x##h(coeff_t *a, intptr_t dstride, const pel10_t *b0, const pel10_t *b1, intptr_t sstride0, intptr_t sstride1)\
 {\
     int x, y;\
     for (y = 0; y < h; y++) {\
@@ -791,13 +1381,15 @@ static void xavs2_pixel_sub_ps_##w##x##h(coeff_t *a, intptr_t dstride, const pel
     }\
 }
 
-#define PIXEL_ADD_PS_C(w, h) \
-static void xavs2_pixel_add_ps_##w##x##h(pel_t *a, intptr_t dstride, const pel_t *b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1)\
+/* Clamp cc to [0, (1 << bit_depth) - 1] with bit_depth = (bb)->param->input_sample_bit_depth; cc is evaluated more than once. */
+#define XAVS2_CLIP1(cc, bb)        ((cc) > ((1 << (bb)->param->input_sample_bit_depth) - 1) ? ((1 << (bb)->param->input_sample_bit_depth) - 1) : ((cc) < 0 ? 0 : (cc)))
+#define PIXEL_ADD_PS8_C(w, h) \
+static void xavs2_pixel_add_ps8_##w##x##h(xavs2_t* bb, pel8_t *a, intptr_t dstride, const pel8_t *b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1)\
 {\
     int x, y;\
     for (y = 0; y < h; y++) {\
         for (x = 0; x < w; x++) {\
-            a[x] = (pel_t)XAVS2_CLIP1(b0[x] + b1[x]);\
+            a[x] = (pel8_t)XAVS2_CLIP1(b0[x] + b1[x], bb);\
         }\
         b0 += sstride0;\
         b1 += sstride1;\
@@ -805,49 +1397,98 @@ static void xavs2_pixel_add_ps_##w##x##h(pel_t *a, intptr_t dstride, const pel_t
     }\
 }
 
-#define BLOCK_OP_C(w, h) \
-    BLOCKCOPY_PP_C(w, h);\
-    BLOCKCOPY_SS_C(w, h);\
-    BLOCKCOPY_SP_C(w, h);\
-    BLOCKCOPY_PS_C(w, h);\
-    PIXEL_SUB_PS_C(w, h);\
-    PIXEL_ADD_PS_C(w, h);
-
-BLOCK_OP_C(64, 64)  /* 64x64 */
-BLOCK_OP_C(64, 32)
-BLOCK_OP_C(32, 64)
-BLOCK_OP_C(64, 16)
-BLOCK_OP_C(64, 48)
-BLOCK_OP_C(16, 64)
-BLOCK_OP_C(48, 64)
-BLOCK_OP_C(32, 32)  /* 32x32 */
-BLOCK_OP_C(32, 16)
-BLOCK_OP_C(16, 32)
-BLOCK_OP_C(32,  8)
-BLOCK_OP_C(32, 24)
-BLOCK_OP_C( 8, 32)
-BLOCK_OP_C(24, 32)
-BLOCK_OP_C(16, 16)  /* 16x16 */
-BLOCK_OP_C(16,  8)
-BLOCK_OP_C( 8, 16)
-BLOCK_OP_C(16,  4)
-BLOCK_OP_C(16, 12)
-BLOCK_OP_C( 4, 16)
-BLOCK_OP_C(12, 16)
-BLOCK_OP_C( 8,  8)  /* 8x8 */
-BLOCK_OP_C( 8,  4)
-BLOCK_OP_C( 4,  8)
-BLOCK_OP_C( 4,  4)  /* 4x4 */
+#define BLOCK_OP8_C(w, h) \
+    BLOCKCOPY_PP8_C(w, h);\
+    BLOCKCOPY_SS8_C(w, h);\
+    BLOCKCOPY_SP8_C(w, h);\
+    BLOCKCOPY_PS8_C(w, h);\
+    PIXEL_SUB_PS8_C(w, h);\
+    PIXEL_ADD_PS8_C(w, h);
+
+BLOCK_OP8_C(64, 64)  /* 64x64 */
+BLOCK_OP8_C(64, 32)
+BLOCK_OP8_C(32, 64)
+BLOCK_OP8_C(64, 16)
+BLOCK_OP8_C(64, 48)
+BLOCK_OP8_C(16, 64)
+BLOCK_OP8_C(48, 64)
+BLOCK_OP8_C(32, 32)  /* 32x32 */
+BLOCK_OP8_C(32, 16)
+BLOCK_OP8_C(16, 32)
+BLOCK_OP8_C(32,  8)
+BLOCK_OP8_C(32, 24)
+BLOCK_OP8_C( 8, 32)
+BLOCK_OP8_C(24, 32)
+BLOCK_OP8_C(16, 16)  /* 16x16 */
+BLOCK_OP8_C(16,  8)
+BLOCK_OP8_C( 8, 16)
+BLOCK_OP8_C(16,  4)
+BLOCK_OP8_C(16, 12)
+BLOCK_OP8_C( 4, 16)
+BLOCK_OP8_C(12, 16)
+BLOCK_OP8_C( 8,  8)  /* 8x8 */
+BLOCK_OP8_C( 8,  4)
+BLOCK_OP8_C( 4,  8)
+BLOCK_OP8_C( 4,  4)  /* 4x4 */
+
+#define PIXEL_ADD_PS10_C(w, h) \
+static void xavs2_pixel_add_ps10_##w##x##h(xavs2_t* bb, pel10_t *a, intptr_t dstride, const pel10_t *b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1)\
+{\
+    int x, y;\
+    for (y = 0; y < h; y++) {\
+        for (x = 0; x < w; x++) {\
+            a[x] = (pel10_t)XAVS2_CLIP1(b0[x] + b1[x], bb);\
+        }\
+        b0 += sstride0;\
+        b1 += sstride1;\
+        a  += dstride;\
+    }\
+}
 
+#define BLOCK_OP10_C(w, h) \
+    BLOCKCOPY_PP10_C(w, h);\
+    BLOCKCOPY_SS10_C(w, h);\
+    BLOCKCOPY_SP10_C(w, h);\
+    BLOCKCOPY_PS10_C(w, h);\
+    PIXEL_SUB_PS10_C(w, h);\
+    PIXEL_ADD_PS10_C(w, h);
+
+BLOCK_OP10_C(64, 64)  /* 64x64 */
+BLOCK_OP10_C(64, 32)
+BLOCK_OP10_C(32, 64)
+BLOCK_OP10_C(64, 16)
+BLOCK_OP10_C(64, 48)
+BLOCK_OP10_C(16, 64)
+BLOCK_OP10_C(48, 64)
+BLOCK_OP10_C(32, 32)  /* 32x32 */
+BLOCK_OP10_C(32, 16)
+BLOCK_OP10_C(16, 32)
+BLOCK_OP10_C(32,  8)
+BLOCK_OP10_C(32, 24)
+BLOCK_OP10_C( 8, 32)
+BLOCK_OP10_C(24, 32)
+BLOCK_OP10_C(16, 16)  /* 16x16 */
+BLOCK_OP10_C(16,  8)
+BLOCK_OP10_C( 8, 16)
+BLOCK_OP10_C(16,  4)
+BLOCK_OP10_C(16, 12)
+BLOCK_OP10_C( 4, 16)
+BLOCK_OP10_C(12, 16)
+BLOCK_OP10_C( 8,  8)  /* 8x8 */
+BLOCK_OP10_C( 8,  4)
+BLOCK_OP10_C( 4,  8)
+BLOCK_OP10_C( 4,  4)  /* 4x4 */
+
+//#if !HIGH_BIT_DEPTH
 /* ---------------------------------------------------------------------------
  */
-static void xavs2_pixel_average(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height)
+static void xavs2_pixel_average8(pel8_t *dst, int i_dst, pel8_t *src1, int i_src1, pel8_t *src2, int i_src2, int width, int height)
 {
     int i, j;
 
     for (i = 0; i < height; i++) {
         for (j = 0; j < width; j++) {
-            dst[j] = (pel_t)((src1[j] + src2[j] + 1) >> 1);
+            dst[j] = (pel8_t)((src1[j] + src2[j] + 1) >> 1);
         }
         dst  += i_dst;
         src1 += i_src1;
@@ -855,10 +1496,25 @@ static void xavs2_pixel_average(pel_t *dst, int i_dst, pel_t *src1, int i_src1,
     }
 }
 
+static void xavs2_pixel_average10(pel10_t *dst, int i_dst, pel10_t *src1, int i_src1, pel10_t *src2, int i_src2, int width, int height)
+{
+    int row, col;
+
+    /* dst[col] = (src1[col] + src2[col] + 1) >> 1 for every sample of a
+     * width x height block; each buffer advances by its own stride per row */
+    for (row = 0; row < height; row++, dst += i_dst, src1 += i_src1, src2 += i_src2) {
+        for (col = 0; col < width; col++) {
+            const int pair_sum = src1[col] + src2[col];
+            dst[col] = (pel10_t)((pair_sum + 1) >> 1);
+        }
+    }
+}
+//#endif
+
 /* ---------------------------------------------------------------------------
  * init functions of block operation : copy / add / sub
  */
-static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf)
+static void init_block_opreation_funcs(xavs2_param_t* param, uint32_t cpuid, pixel_funcs_t* pixf)
 {
 #define ALL_LUMA_CU(name1, name2, cpu) \
     pixf->name1[LUMA_64x64] = xavs2_ ## name2 ## _64x64 ## cpu;\
@@ -897,27 +1553,104 @@ static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf)
     /* -------------------------------------------------------------
      * init all c functions
      */
+    if (param->input_sample_bit_depth == 8) {
     //ALL_LUMA_CU(add_ps,  pixel_add_ps, );
-    ALL_LUMA_PU(add_ps,  pixel_add_ps, );
+    ALL_LUMA_PU(add_ps8,  pixel_add_ps8, );
 //    ALL_LUMA_CU(sub_ps,  pixel_sub_ps, );
-    ALL_LUMA_PU(sub_ps,  pixel_sub_ps, );
-    ALL_LUMA_PU(copy_sp, blockcopy_sp, );
-    ALL_LUMA_PU(copy_ps, blockcopy_ps, );
-    ALL_LUMA_PU(copy_ss, blockcopy_ss, );
-    ALL_LUMA_PU(copy_pp, blockcopy_pp, );
-    pixf->ssd_block = xavs2_get_block_ssd_c;
+    ALL_LUMA_PU(sub_ps8,  pixel_sub_ps8, );
+    ALL_LUMA_PU(copy_sp8, blockcopy_sp8, );
+    ALL_LUMA_PU(copy_ps8, blockcopy_ps8, );
+    ALL_LUMA_PU(copy_ss8, blockcopy_ss8, );
+    ALL_LUMA_PU(copy_pp8, blockcopy_pp8, );
+//#if !HIGH_BIT_DEPTH
+    pixf->ssd_block8 = xavs2_get_block_ssd8_c;
+//#endif
+    } else {
+    //ALL_LUMA_CU(add_ps10,  pixel_add_ps10, );
+    ALL_LUMA_PU(add_ps10,  pixel_add_ps10, );
+//    ALL_LUMA_CU(sub_ps10,  pixel_sub_ps10, );
+    ALL_LUMA_PU(sub_ps10,  pixel_sub_ps10, );
+    ALL_LUMA_PU(copy_sp10, blockcopy_sp10, );
+    ALL_LUMA_PU(copy_ps10, blockcopy_ps10, );
+    ALL_LUMA_PU(copy_ss10, blockcopy_ss10, );
+    ALL_LUMA_PU(copy_pp10, blockcopy_pp10, );
+//#if !HIGH_BIT_DEPTH
+    pixf->ssd_block10 = xavs2_get_block_ssd10_c;
+//#endif
+    }
 
     /* -------------------------------------------------------------
      * init all SIMD functions
      */
 #if HAVE_MMX
     if (cpuid & XAVS2_CPU_SSE2) {
+#if HIGH_BIT_DEPTH
+        // 10-bit assembly
+        if (sizeof(pel_t) == sizeof(int16_t) && cpuid) {
+            pixf->copy_pp[LUMA_64x64] = (copy_pp_t)xavs2_blockcopy_ss_64x64_sse2;  /* 64x64 */
+            pixf->copy_pp[LUMA_64x32] = (copy_pp_t)xavs2_blockcopy_ss_64x32_sse2;
+            pixf->copy_pp[LUMA_32x64] = (copy_pp_t)xavs2_blockcopy_ss_32x64_sse2;
+            pixf->copy_pp[LUMA_64x16] = (copy_pp_t)xavs2_blockcopy_ss_64x16_sse2;
+            pixf->copy_pp[LUMA_64x48] = (copy_pp_t)xavs2_blockcopy_ss_64x48_sse2;
+            pixf->copy_pp[LUMA_16x64] = (copy_pp_t)xavs2_blockcopy_ss_16x64_sse2;
+            pixf->copy_pp[LUMA_48x64] = (copy_pp_t)xavs2_blockcopy_ss_48x64_sse2;
+            pixf->copy_pp[LUMA_32x32] = (copy_pp_t)xavs2_blockcopy_ss_32x32_sse2; /* 32x32 */
+            pixf->copy_pp[LUMA_32x16] = (copy_pp_t)xavs2_blockcopy_ss_32x16_sse2;
+            pixf->copy_pp[LUMA_16x32] = (copy_pp_t)xavs2_blockcopy_ss_16x32_sse2;
+            pixf->copy_pp[LUMA_32x8 ] = (copy_pp_t)xavs2_blockcopy_ss_32x8_sse2;
+            pixf->copy_pp[LUMA_32x24] = (copy_pp_t)xavs2_blockcopy_ss_32x24_sse2;
+            pixf->copy_pp[LUMA_8x32 ] = (copy_pp_t)xavs2_blockcopy_ss_8x32_sse2;
+            pixf->copy_pp[LUMA_24x32] = (copy_pp_t)xavs2_blockcopy_ss_24x32_sse2;
+            pixf->copy_pp[LUMA_16x16] = (copy_pp_t)xavs2_blockcopy_ss_16x16_sse2; /* 16x16 */
+            pixf->copy_pp[LUMA_16x8 ] = (copy_pp_t)xavs2_blockcopy_ss_16x8_sse2;
+            pixf->copy_pp[LUMA_8x16 ] = (copy_pp_t)xavs2_blockcopy_ss_8x16_sse2;
+            pixf->copy_pp[LUMA_16x4 ] = (copy_pp_t)xavs2_blockcopy_ss_16x4_sse2;
+            pixf->copy_pp[LUMA_16x12] = (copy_pp_t)xavs2_blockcopy_ss_16x12_sse2;
+            pixf->copy_pp[LUMA_4x16 ] = (copy_pp_t)xavs2_blockcopy_ss_4x16_sse2;
+            pixf->copy_pp[LUMA_12x16] = (copy_pp_t)xavs2_blockcopy_ss_12x16_sse2;
+            pixf->copy_pp[LUMA_8x8  ] = (copy_pp_t)xavs2_blockcopy_ss_8x8_sse2; /* 8x8 */
+            pixf->copy_pp[LUMA_8x4  ] = (copy_pp_t)xavs2_blockcopy_ss_8x4_sse2;
+            pixf->copy_pp[LUMA_4x8  ] = (copy_pp_t)xavs2_blockcopy_ss_4x8_sse2;
+            pixf->copy_pp[LUMA_4x4  ] = (copy_pp_t)xavs2_blockcopy_ss_4x4_sse2;  /* 4x4 */
+        }
+        if (sizeof(coeff_t) == sizeof(int16_t) && cpuid) {
+            pixf->copy_ss[LUMA_64x64] = (copy_ss_t)xavs2_blockcopy_ss_64x64_sse2;  /* 64x64 */
+            pixf->copy_ss[LUMA_64x32] = (copy_ss_t)xavs2_blockcopy_ss_64x32_sse2;
+            pixf->copy_ss[LUMA_32x64] = (copy_ss_t)xavs2_blockcopy_ss_32x64_sse2;
+            pixf->copy_ss[LUMA_64x16] = (copy_ss_t)xavs2_blockcopy_ss_64x16_sse2;
+            pixf->copy_ss[LUMA_64x48] = (copy_ss_t)xavs2_blockcopy_ss_64x48_sse2;
+            pixf->copy_ss[LUMA_16x64] = (copy_ss_t)xavs2_blockcopy_ss_16x64_sse2;
+            pixf->copy_ss[LUMA_48x64] = (copy_ss_t)xavs2_blockcopy_ss_48x64_sse2;
+            pixf->copy_ss[LUMA_32x32] = (copy_ss_t)xavs2_blockcopy_ss_32x32_sse2; /* 32x32 */
+            pixf->copy_ss[LUMA_32x16] = (copy_ss_t)xavs2_blockcopy_ss_32x16_sse2;
+            pixf->copy_ss[LUMA_16x32] = (copy_ss_t)xavs2_blockcopy_ss_16x32_sse2;
+            pixf->copy_ss[LUMA_32x8 ] = (copy_ss_t)xavs2_blockcopy_ss_32x8_sse2;
+            pixf->copy_ss[LUMA_32x24] = (copy_ss_t)xavs2_blockcopy_ss_32x24_sse2;
+            pixf->copy_ss[LUMA_8x32 ] = (copy_ss_t)xavs2_blockcopy_ss_8x32_sse2;
+            pixf->copy_ss[LUMA_24x32] = (copy_ss_t)xavs2_blockcopy_ss_24x32_sse2;
+            pixf->copy_ss[LUMA_16x16] = (copy_ss_t)xavs2_blockcopy_ss_16x16_sse2; /* 16x16 */
+            pixf->copy_ss[LUMA_16x8 ] = (copy_ss_t)xavs2_blockcopy_ss_16x8_sse2;
+            pixf->copy_ss[LUMA_8x16 ] = (copy_ss_t)xavs2_blockcopy_ss_8x16_sse2;
+            pixf->copy_ss[LUMA_16x4 ] = (copy_ss_t)xavs2_blockcopy_ss_16x4_sse2;
+            pixf->copy_ss[LUMA_16x12] = (copy_ss_t)xavs2_blockcopy_ss_16x12_sse2;
+            pixf->copy_ss[LUMA_4x16 ] = (copy_ss_t)xavs2_blockcopy_ss_4x16_sse2;
+            pixf->copy_ss[LUMA_12x16] = (copy_ss_t)xavs2_blockcopy_ss_12x16_sse2;
+            pixf->copy_ss[LUMA_8x8  ] = (copy_ss_t)xavs2_blockcopy_ss_8x8_sse2; /* 8x8 */
+            pixf->copy_ss[LUMA_8x4  ] = (copy_ss_t)xavs2_blockcopy_ss_8x4_sse2;
+            pixf->copy_ss[LUMA_4x8  ] = (copy_ss_t)xavs2_blockcopy_ss_4x8_sse2;
+            pixf->copy_ss[LUMA_4x4  ] = (copy_ss_t)xavs2_blockcopy_ss_4x4_sse2;  /* 4x4 */
+        }
+#else
         ALL_LUMA_PU(copy_sp, blockcopy_sp, _sse2);
         ALL_LUMA_PU(copy_ss, blockcopy_ss, _sse2);
         ALL_LUMA_PU(copy_pp, blockcopy_pp, _sse2);
+#endif
     }
 
     if (cpuid & XAVS2_CPU_SSE4) {
+#if HIGH_BIT_DEPTH
+        // 10-bit assembly: nothing is registered in this branch
+#else
         pixf->add_ps [LUMA_4x4  ] = xavs2_pixel_add_ps_4x4_sse4;
         pixf->add_ps [LUMA_4x8  ] = xavs2_pixel_add_ps_4x8_sse4;
         pixf->add_ps [LUMA_4x16 ] = xavs2_pixel_add_ps_4x16_sse4;
@@ -961,9 +1694,51 @@ static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sub_ps [LUMA_64x64] = xavs2_pixel_sub_ps_64x64_sse4;
 
         ALL_LUMA_PU(copy_ps, blockcopy_ps, _sse4);
+#endif
     }
 
-    if (cpuid & XAVS2_CPU_AVX) {
+    if (cpuid & XAVS2_CPU_AVX2) {
+#if HIGH_BIT_DEPTH
+        // HIGH_BIT_DEPTH (10-bit): reuse the blockcopy_ss AVX kernels where the element type is 16 bits wide
+        if (sizeof(pel_t) == sizeof(int16_t) && cpuid) {
+            pixf->copy_pp[LUMA_64x64] = (copy_pp_t)xavs2_blockcopy_ss_64x64_avx;
+            pixf->copy_pp[LUMA_64x32] = (copy_pp_t)xavs2_blockcopy_ss_64x32_avx;
+            pixf->copy_pp[LUMA_32x64] = (copy_pp_t)xavs2_blockcopy_ss_32x64_avx;
+            pixf->copy_pp[LUMA_64x16] = (copy_pp_t)xavs2_blockcopy_ss_64x16_avx;
+            pixf->copy_pp[LUMA_64x48] = (copy_pp_t)xavs2_blockcopy_ss_64x48_avx;
+            pixf->copy_pp[LUMA_16x64] = (copy_pp_t)xavs2_blockcopy_ss_16x64_avx;
+            pixf->copy_pp[LUMA_48x64] = (copy_pp_t)xavs2_blockcopy_ss_48x64_avx;
+            pixf->copy_pp[LUMA_32x32] = (copy_pp_t)xavs2_blockcopy_ss_32x32_avx;
+            pixf->copy_pp[LUMA_32x16] = (copy_pp_t)xavs2_blockcopy_ss_32x16_avx;
+            pixf->copy_pp[LUMA_16x32] = (copy_pp_t)xavs2_blockcopy_ss_16x32_avx;
+            pixf->copy_pp[LUMA_32x8 ] = (copy_pp_t)xavs2_blockcopy_ss_32x8_avx;
+            pixf->copy_pp[LUMA_32x24] = (copy_pp_t)xavs2_blockcopy_ss_32x24_avx;
+            pixf->copy_pp[LUMA_24x32] = (copy_pp_t)xavs2_blockcopy_ss_24x32_avx;
+            pixf->copy_pp[LUMA_16x16] = (copy_pp_t)xavs2_blockcopy_ss_16x16_avx;
+            pixf->copy_pp[LUMA_16x8 ] = (copy_pp_t)xavs2_blockcopy_ss_16x8_avx;
+            pixf->copy_pp[LUMA_16x4 ] = (copy_pp_t)xavs2_blockcopy_ss_16x4_avx;
+            pixf->copy_pp[LUMA_16x12] = (copy_pp_t)xavs2_blockcopy_ss_16x12_avx;
+        }
+        if (sizeof(coeff_t) == sizeof(int16_t) && cpuid) {
+            pixf->copy_ss[LUMA_64x64] = (copy_ss_t)xavs2_blockcopy_ss_64x64_avx;
+            pixf->copy_ss[LUMA_64x32] = (copy_ss_t)xavs2_blockcopy_ss_64x32_avx;
+            pixf->copy_ss[LUMA_32x64] = (copy_ss_t)xavs2_blockcopy_ss_32x64_avx;
+            pixf->copy_ss[LUMA_64x16] = (copy_ss_t)xavs2_blockcopy_ss_64x16_avx;
+            pixf->copy_ss[LUMA_64x48] = (copy_ss_t)xavs2_blockcopy_ss_64x48_avx;
+            pixf->copy_ss[LUMA_16x64] = (copy_ss_t)xavs2_blockcopy_ss_16x64_avx;
+            pixf->copy_ss[LUMA_48x64] = (copy_ss_t)xavs2_blockcopy_ss_48x64_avx;
+            pixf->copy_ss[LUMA_32x32] = (copy_ss_t)xavs2_blockcopy_ss_32x32_avx;
+            pixf->copy_ss[LUMA_32x16] = (copy_ss_t)xavs2_blockcopy_ss_32x16_avx;
+            pixf->copy_ss[LUMA_16x32] = (copy_ss_t)xavs2_blockcopy_ss_16x32_avx;
+            pixf->copy_ss[LUMA_32x8 ] = (copy_ss_t)xavs2_blockcopy_ss_32x8_avx;
+            pixf->copy_ss[LUMA_32x24] = (copy_ss_t)xavs2_blockcopy_ss_32x24_avx;
+            pixf->copy_ss[LUMA_24x32] = (copy_ss_t)xavs2_blockcopy_ss_24x32_avx;
+            pixf->copy_ss[LUMA_16x16] = (copy_ss_t)xavs2_blockcopy_ss_16x16_avx;
+            pixf->copy_ss[LUMA_16x8 ] = (copy_ss_t)xavs2_blockcopy_ss_16x8_avx;
+            pixf->copy_ss[LUMA_16x4 ] = (copy_ss_t)xavs2_blockcopy_ss_16x4_avx;
+            pixf->copy_ss[LUMA_16x12] = (copy_ss_t)xavs2_blockcopy_ss_16x12_avx;
+        }
+#else
         pixf->copy_pp[LUMA_64x64] = xavs2_blockcopy_pp_64x64_avx;
         pixf->copy_pp[LUMA_64x32] = xavs2_blockcopy_pp_64x32_avx;
         pixf->copy_pp[LUMA_32x64] = xavs2_blockcopy_pp_32x64_avx;
@@ -992,9 +1767,14 @@ static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->copy_ss[LUMA_16x8 ] = xavs2_blockcopy_ss_16x8_avx;
         pixf->copy_ss[LUMA_16x4 ] = xavs2_blockcopy_ss_16x4_avx;
         pixf->copy_ss[LUMA_16x12] = xavs2_blockcopy_ss_16x12_avx;
+#endif
     }
 
+#if defined(__AVX2__)
     if (cpuid & XAVS2_CPU_AVX2) {
+#if HIGH_BIT_DEPTH
+        // HIGH_BIT_DEPTH (10-bit): no assembly kernels are installed in this branch
+#else
         pixf->add_ps [LUMA_16x4 ] = xavs2_pixel_add_ps_16x4_avx2;
         pixf->add_ps [LUMA_16x8 ] = xavs2_pixel_add_ps_16x8_avx2;
         pixf->add_ps [LUMA_16x16] = xavs2_pixel_add_ps_16x16_avx2;
@@ -1035,7 +1815,9 @@ static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->copy_ps[LUMA_32x32] = xavs2_blockcopy_ps_32x32_avx2;
         pixf->copy_ps[LUMA_16x32] = xavs2_blockcopy_ps_16x32_avx2;
         pixf->copy_ps[LUMA_16x16] = xavs2_blockcopy_ps_16x16_avx2;
+#endif
     }
+#endif
 #endif // if HAVE_MMX
 
 #undef ALL_LUMA_CU
@@ -1047,8 +1829,9 @@ static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf)
  * pixel init
  * ---------------------------------------------------------------------------
  */
-void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
+void xavs2_pixel_init(xavs2_param_t* param, uint32_t cpuid, pixel_funcs_t* pixf)
 {
+//#if !HIGH_BIT_DEPTH
     /* -------------------------------------------------------------
      */
 #define INIT_PIXEL_FUNC(name, cpu) \
@@ -1082,7 +1865,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
     pixf->name[LUMA_4x8  ] = xavs2_pixel_ ## name ## _4x8   ## cpu;\
     /* 4x4 */                                                    \
     pixf->name[LUMA_4x4  ] = xavs2_pixel_ ## name ## _4x4   ## cpu;
-
+//#endif
 
     /* -------------------------------------------------------------
      */
@@ -1112,6 +1895,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
     pixf->satd[LUMA_8x4  ] = xavs2_pixel_satd_8x4_   ## cpu;\
     pixf->satd[LUMA_4x8  ] = xavs2_pixel_satd_4x8_   ## cpu;
 
+//#if !HIGH_BIT_DEPTH
     /* -------------------------------------------------------------
      */
 #define INIT_SSD(cpu) \
@@ -1138,21 +1922,35 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
     /* -------------------------------------------------------------
      * init all c functions
      */
-    INIT_PIXEL_FUNC(sad,    );        // sad
-    INIT_PIXEL_FUNC(sad_x3, );        // sad_x3
-    INIT_PIXEL_FUNC(sad_x4, );        // sad_x4
-    INIT_PIXEL_FUNC(satd,   );        // satd
-    INIT_PIXEL_FUNC(ssd,    );        // ssd
-    INIT_PIXEL_FUNC(avg,    );        // avg
-    INIT_PIXEL_FUNC(sa8d,   );        // sa8d
-
-    pixf->average = xavs2_pixel_average;// block average
+    if (param->input_sample_bit_depth == 8) {
+    INIT_PIXEL_FUNC(sad8,    );        // sad
+    INIT_PIXEL_FUNC(sad8_x3, );        // sad_x3
+    INIT_PIXEL_FUNC(sad8_x4, );        // sad_x4
+    INIT_PIXEL_FUNC(satd8,   );        // satd
+    INIT_PIXEL_FUNC(ssd8,    );        // ssd
+    INIT_PIXEL_FUNC(avg8,    );        // avg
+    INIT_PIXEL_FUNC(sa8d8,   );        // sa8d
+
+    pixf->average8 = xavs2_pixel_average8;// block average
+    } else {
+    INIT_PIXEL_FUNC(sad10,    );        // sad
+    INIT_PIXEL_FUNC(sad10_x3, );        // sad_x3
+    INIT_PIXEL_FUNC(sad10_x4, );        // sad_x4
+    INIT_PIXEL_FUNC(satd10,   );        // satd
+    INIT_PIXEL_FUNC(ssd10,    );        // ssd
+    INIT_PIXEL_FUNC(avg10,    );        // avg
+    INIT_PIXEL_FUNC(sa8d10,   );        // sa8d
+
+    pixf->average10 = xavs2_pixel_average10;// block average
+    }
+//#endif
 
     /* -------------------------------------------------------------
      * init SIMD functions
      */
 #if HAVE_MMX
     if (cpuid & XAVS2_CPU_MMX2) {
+#if !HIGH_BIT_DEPTH
         pixf->sad   [LUMA_16x16] = xavs2_pixel_sad_16x16_mmx2;
         pixf->sad   [LUMA_16x8 ] = xavs2_pixel_sad_16x8_mmx2;
         pixf->sad   [LUMA_8x16 ] = xavs2_pixel_sad_8x16_mmx2;
@@ -1190,6 +1988,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->ssd   [LUMA_8x4  ] = xavs2_pixel_ssd_8x4_mmx;
         pixf->ssd   [LUMA_4x8  ] = xavs2_pixel_ssd_4x8_mmx;
         pixf->ssd   [LUMA_4x4  ] = xavs2_pixel_ssd_4x4_mmx;
+#endif
 
         pixf->satd  [LUMA_16x16] = xavs2_pixel_satd_16x16_mmx2;
         pixf->satd  [LUMA_16x8 ] = xavs2_pixel_satd_16x8_mmx2;
@@ -1211,6 +2010,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
     }
 
     if (cpuid & XAVS2_CPU_SSE2) {
+#if !HIGH_BIT_DEPTH
         pixf->sad   [LUMA_16x16] = xavs2_pixel_sad_16x16_sse2;
         pixf->sad   [LUMA_16x8 ] = xavs2_pixel_sad_16x8_sse2;
         pixf->sad   [LUMA_16x12] = xavs2_pixel_sad_16x12_sse2;
@@ -1232,6 +2032,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sad   [LUMA_48x64] = xavs2_pixel_sad_48x64_sse2;
         pixf->sad   [LUMA_24x32] = xavs2_pixel_sad_24x32_sse2;
         pixf->sad   [LUMA_12x16] = xavs2_pixel_sad_12x16_sse2;
+#endif
         pixf->sa8d  [LUMA_64x16] = xavs2_pixel_sa8d_64x16_sse2;
         pixf->sa8d  [LUMA_64x32] = xavs2_pixel_sa8d_64x32_sse2;
         pixf->sa8d  [LUMA_64x48] = xavs2_pixel_sa8d_64x48_sse2;
@@ -1245,9 +2046,9 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sa8d  [LUMA_32x32] = xavs2_pixel_sa8d_32x32_sse2;
         pixf->sa8d  [LUMA_64x64] = xavs2_pixel_sa8d_64x64_sse2;
 
-
         INIT_SATD(sse2);
 
+#if !HIGH_BIT_DEPTH
         pixf->sad_x3[LUMA_16x16] = xavs2_pixel_sad_x3_16x16_sse2;
         pixf->sad_x3[LUMA_16x8 ] = xavs2_pixel_sad_x3_16x8_sse2;
         pixf->sad_x3[LUMA_8x16 ] = xavs2_pixel_sad_x3_8x16_sse2;
@@ -1261,9 +2062,10 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sad_x4[LUMA_8x4  ] = xavs2_pixel_sad_x4_8x4_sse2;
 
         INIT_SSD (sse2);
-
+#endif
     }
 
+#if !HIGH_BIT_DEPTH
     if (cpuid & XAVS2_CPU_SSE3) {
         pixf->sad   [LUMA_16x16] = xavs2_pixel_sad_16x16_sse3;
         pixf->sad   [LUMA_16x8 ] = xavs2_pixel_sad_16x8_sse3;
@@ -1292,12 +2094,13 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sad_x4[LUMA_16x16] = xavs2_pixel_sad_x4_16x16_sse3;
         pixf->sad_x4[LUMA_16x8 ] = xavs2_pixel_sad_x4_16x8_sse3;
         pixf->sad_x4[LUMA_16x4 ] = xavs2_pixel_sad_x4_16x4_sse3;
-
     }
+#endif
 
     if (cpuid & XAVS2_CPU_SSSE3) {
         INIT_SATD(ssse3);
 
+#if !HIGH_BIT_DEPTH
         pixf->sad_x3[LUMA_64x64] = xavs2_pixel_sad_x3_64x64_ssse3;    /* 64x64 */
         pixf->sad_x3[LUMA_64x32] = xavs2_pixel_sad_x3_64x32_ssse3;
         pixf->sad_x3[LUMA_32x64] = xavs2_pixel_sad_x3_32x64_ssse3;
@@ -1337,6 +2140,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sad_x4[LUMA_12x16] = xavs2_pixel_sad_x4_12x16_ssse3;
 
         INIT_SSD (ssse3);
+#endif
 
         pixf->sa8d  [LUMA_4x4  ] = xavs2_pixel_satd_4x4_ssse3;
         pixf->sa8d  [LUMA_8x8  ] = xavs2_pixel_sa8d_8x8_ssse3;
@@ -1345,11 +2149,11 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sa8d  [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_ssse3;
         pixf->sa8d  [LUMA_16x32] = xavs2_pixel_sa8d_16x32_ssse3;
         pixf->sa8d  [LUMA_32x64] = xavs2_pixel_sa8d_32x64_ssse3;
-
     }
 
     if (cpuid & XAVS2_CPU_SSE4) {
         INIT_SATD(sse4);
+//#if !HIGH_BIT_DEPTH
         pixf->ssd   [LUMA_12x16] = xavs2_pixel_ssd_12x16_sse4;
         pixf->ssd   [LUMA_24x32] = xavs2_pixel_ssd_24x32_sse4;
         pixf->ssd   [LUMA_48x64] = xavs2_pixel_ssd_48x64_sse4;
@@ -1357,6 +2161,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->ssd   [LUMA_64x32] = xavs2_pixel_ssd_64x32_sse4;
         pixf->ssd   [LUMA_64x48] = xavs2_pixel_ssd_64x48_sse4;
         pixf->ssd   [LUMA_64x64] = xavs2_pixel_ssd_64x64_sse4;
+//#endif
 
         pixf->sa8d  [LUMA_4x4  ] = xavs2_pixel_satd_4x4_sse4;
         pixf->sa8d  [LUMA_8x8  ] = xavs2_pixel_sa8d_8x8_sse4;
@@ -1365,11 +2170,11 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sa8d  [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_sse4;
         pixf->sa8d  [LUMA_16x32] = xavs2_pixel_sa8d_16x32_sse4;
         pixf->sa8d  [LUMA_32x64] = xavs2_pixel_sa8d_32x64_sse4;
-
     }
 
     if (cpuid & XAVS2_CPU_AVX) {
         INIT_SATD(avx);
+#if !HIGH_BIT_DEPTH
         pixf->sad_x3[LUMA_64x64] = xavs2_pixel_sad_x3_64x64_avx;  /* 64x64 */
         pixf->sad_x3[LUMA_64x32] = xavs2_pixel_sad_x3_64x32_avx;
         pixf->sad_x3[LUMA_32x64] = xavs2_pixel_sad_x3_32x64_avx;
@@ -1409,6 +2214,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sad_x4[LUMA_12x16] = xavs2_pixel_sad_x4_12x16_avx;
 
         INIT_SSD (avx);
+#endif
 
         pixf->sa8d  [LUMA_4x4  ] = xavs2_pixel_satd_4x4_avx;
         pixf->sa8d  [LUMA_8x8  ] = xavs2_pixel_sa8d_8x8_avx;
@@ -1418,16 +2224,18 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sa8d  [LUMA_16x32] = xavs2_pixel_sa8d_16x32_avx;
         pixf->sa8d  [LUMA_32x64] = xavs2_pixel_sa8d_32x64_avx;
         pixf->sa8d  [LUMA_64x64] = xavs2_pixel_sa8d_64x64_avx;
-
     }
 
+#if defined(__XOP__)
     if (cpuid & XAVS2_CPU_XOP) {
         INIT_SATD(xop);
+#if !HIGH_BIT_DEPTH
         pixf->ssd   [LUMA_16x16] = xavs2_pixel_ssd_16x16_xop;
         pixf->ssd   [LUMA_16x8 ] = xavs2_pixel_ssd_16x8_xop;
         pixf->ssd   [LUMA_8x16 ] = xavs2_pixel_ssd_8x16_xop;
         pixf->ssd   [LUMA_8x8  ] = xavs2_pixel_ssd_8x8_xop;
         pixf->ssd   [LUMA_8x4  ] = xavs2_pixel_ssd_8x4_xop;
+#endif
 
         //pixf->sa8d  [LUMA_4x4  ] = xavs2_pixel_satd_4x4_xop; // in x265, this one is broken
         pixf->sa8d  [LUMA_8x8  ] = xavs2_pixel_sa8d_8x8_xop;
@@ -1436,11 +2244,13 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sa8d  [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_xop;
         pixf->sa8d  [LUMA_16x32] = xavs2_pixel_sa8d_16x32_xop;
         pixf->sa8d  [LUMA_32x64] = xavs2_pixel_sa8d_32x64_xop;
-
     }
+#endif
 
+#if defined(__AVX2__)
 #if ARCH_X86_64
     if (cpuid & XAVS2_CPU_AVX2) {
+#if !HIGH_BIT_DEPTH
         pixf->sad   [LUMA_32x8 ] = xavs2_pixel_sad_32x8_avx2;
         pixf->sad   [LUMA_32x16] = xavs2_pixel_sad_32x16_avx2;
         pixf->sad   [LUMA_32x24] = xavs2_pixel_sad_32x24_avx2;
@@ -1461,6 +2271,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->ssd   [LUMA_32x32] = xavs2_pixel_ssd_32x32_avx2;
         pixf->ssd   [LUMA_16x16] = xavs2_pixel_ssd_16x16_avx2;
         pixf->ssd   [LUMA_16x8 ] = xavs2_pixel_ssd_16x8_avx2;
+#endif
 
         pixf->satd  [LUMA_16x16] = xavs2_pixel_satd_16x16_avx2;
         pixf->satd  [LUMA_16x8 ] = xavs2_pixel_satd_16x8_avx2;
@@ -1480,6 +2291,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->satd  [LUMA_16x4 ] = xavs2_pixel_satd_16x4_avx2;
         pixf->satd  [LUMA_16x12] = xavs2_pixel_satd_16x12_avx2;
 
+#if !HIGH_BIT_DEPTH
         pixf->sad_x3[LUMA_32x8 ] = xavs2_pixel_sad_x3_32x8_avx2;
         pixf->sad_x3[LUMA_32x16] = xavs2_pixel_sad_x3_32x16_avx2;
         pixf->sad_x3[LUMA_32x24] = xavs2_pixel_sad_x3_32x24_avx2;
@@ -1505,11 +2317,13 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         pixf->sad_x4[LUMA_64x32] = xavs2_pixel_sad_x4_64x32_avx2;
         pixf->sad_x4[LUMA_64x48] = xavs2_pixel_sad_x4_64x48_avx2;
         pixf->sad_x4[LUMA_64x64] = xavs2_pixel_sad_x4_64x64_avx2;
+#endif
 
         pixf->sa8d  [LUMA_8x8  ] = xavs2_pixel_sa8d_8x8_avx2;
         pixf->sa8d  [LUMA_16x16] = xavs2_pixel_sa8d_16x16_avx2;
         pixf->sa8d  [LUMA_32x32] = xavs2_pixel_sa8d_32x32_avx2;
     }
+#endif
 #endif
 
     /* -------------------------------------------------------------
@@ -1534,12 +2348,14 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         INIT_PIXEL_AVG(16,  8, mmx2);
         INIT_PIXEL_AVG(16,  4, mmx2);
         INIT_PIXEL_AVG(16, 12, mmx2);
+#if !HIGH_BIT_DEPTH
         INIT_PIXEL_AVG( 8, 32, mmx2);
         INIT_PIXEL_AVG( 8, 16, mmx2);
-        INIT_PIXEL_AVG( 4, 16, mmx2);
-        INIT_PIXEL_AVG(12, 16, mmx2);
         INIT_PIXEL_AVG( 8,  8, mmx2);
         INIT_PIXEL_AVG( 8,  4, mmx2);
+#endif
+        INIT_PIXEL_AVG( 4, 16, mmx2);
+        INIT_PIXEL_AVG(12, 16, mmx2);
         INIT_PIXEL_AVG( 4,  8, mmx2);
         INIT_PIXEL_AVG( 4,  4, mmx2);
     }
@@ -1569,10 +2385,24 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         INIT_PIXEL_AVG( 8,  4, sse2);
     }
 
+#if !HIGH_BIT_DEPTH
     if (cpuid & XAVS2_CPU_SSE3) {
         INIT_PIXEL_FUNC(avg, _ssse3);
     }
+#endif
+
+    /* block average */
+    if (cpuid & XAVS2_CPU_SSE42) {
+        pixf->average = xavs2_pixel_average_sse128;
+    }
 
+#if _MSC_VER
+    if (cpuid & XAVS2_CPU_AVX) {
+        pixf->average = xavs2_pixel_average_avx;
+    }
+#endif
+
+#if defined(__AVX2__)
     if (cpuid & XAVS2_CPU_AVX2) {
 #if ARCH_X86_64
         INIT_PIXEL_AVG(64, 64, avx2);
@@ -1592,20 +2422,11 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
         INIT_PIXEL_AVG(16,  4, avx2);
         INIT_PIXEL_AVG(16, 12, avx2);
     }
-
-    /* block average */
-    if (cpuid & XAVS2_CPU_SSE42) {
-        pixf->average = xavs2_pixel_average_sse128;
-    }
-#if _MSC_VER
-    if (cpuid & XAVS2_CPU_AVX2) {
-        pixf->average = xavs2_pixel_average_avx;
-    }
 #endif
 #endif
 
     /* init functions of block operation : copy/add/sub */
-    init_block_opreation_funcs(cpuid, pixf);
+    init_block_opreation_funcs(param, cpuid, pixf);
 
 
 #undef INIT_PIXEL_AVG
@@ -1617,9 +2438,9 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf)
 
 /* ---------------------------------------------------------------------------
  */
-static int mad_NxN_c(pel_t *p_src, int i_src, int cu_size)
+static int mad8_NxN_c(pel8_t *p_src, int i_src, int cu_size)
 {
-    pel_t *p_src_base = p_src;
+    pel8_t *p_src_base = p_src;
     int num_pix = cu_size * cu_size;
     int x, y;
     int sum = 0;
@@ -1648,23 +2469,71 @@ static int mad_NxN_c(pel_t *p_src, int i_src, int cu_size)
     return mad;
 }
 
+/* Sum of absolute differences between every sample of a cu_size x cu_size
+ * block and the block's rounded mean (pel10_t variant).
+ * p_src: top-left sample of the block; i_src: row stride in samples. */
+static int mad10_NxN_c(pel10_t *p_src, int i_src, int cu_size)
+{
+    const int num_pix = cu_size * cu_size;
+    pel10_t *p_row;
+    int row, col;
+    int total = 0;
+    int mean;
+    int mad = 0;
+
+    /* pass 1: accumulate every sample to obtain the rounded block mean */
+    for (row = 0, p_row = p_src; row < cu_size; row++, p_row += i_src) {
+        for (col = 0; col < cu_size; col++) {
+            total += p_row[col];
+        }
+    }
+    mean = (total + (num_pix >> 1)) / num_pix;
+
+    /* pass 2: accumulate the absolute deviation of each sample from the mean */
+    for (row = 0, p_row = p_src; row < cu_size; row++, p_row += i_src) {
+        for (col = 0; col < cu_size; col++) {
+            const int sample = p_row[col];
+            mad += XAVS2_ABS(sample - mean);
+        }
+    }
+
+    return mad;
+}
 
 /* ---------------------------------------------------------------------------
  */
-void xavs2_mad_init(uint32_t cpuid, mad_funcs_t *madf)
+void xavs2_mad8_init(uint32_t cpuid, mad_funcs8_t *madf8)    /* fills the pel8_t MAD handler table */
 {
-    madf[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_NxN_c;
-    madf[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_NxN_c;
-    madf[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_NxN_c;
+    madf8[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad8_NxN_c;    /* portable C default for all three CU size slots */
+    madf8[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad8_NxN_c;
+    madf8[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad8_NxN_c;
 
     /* init asm function handles */
 #if HAVE_MMX
     /* functions defined in file intrinsic_mad.c */
     if (cpuid & XAVS2_CPU_SSE2) {
-        madf[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_16x16_sse128;
-        madf[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_32x32_sse128;
-        madf[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_64x64_sse128;
+        madf8[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_16x16_sse128;    /* SSE2 flag set: size-specific kernels replace the C default */
+        madf8[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_32x32_sse128;
+        madf8[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_64x64_sse128;
     }
 #endif //if HAVE_MMX
 }
 
+/* Fill the pel10_t MAD handler table; only the portable C kernel is used. */
+void xavs2_mad10_init(uint32_t cpuid, mad_funcs10_t *madf10)
+{
+    /* The mad_*_sse128 kernels from intrinsic_mad.c are the ones that
+     * xavs2_mad8_init() installs into the pel8_t table, so they cannot also
+     * match the pel10_t signature of mad_funcs10_t.  Installing them here
+     * would mix incompatible function pointer types (and presumably feed
+     * 10-bit samples to an 8-bit kernel -- TODO confirm against
+     * intrinsic_mad.c), so every slot stays on mad10_NxN_c until dedicated
+     * 10-bit SIMD kernels exist. */
+    (void)cpuid;    /* kept for signature parity with xavs2_mad8_init() */
+
+    madf10[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad10_NxN_c;
+    madf10[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad10_NxN_c;
+    madf10[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad10_NxN_c;
+}
+
+
diff --git a/source/common/pixel.h b/source/common/pixel.h
index 645b591..2630772 100644
--- a/source/common/pixel.h
+++ b/source/common/pixel.h
@@ -121,48 +121,79 @@ enum ChromaCU {
 };
 
 
-typedef cmp_dist_t(*pixel_cmp_t)(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2);
-typedef dist_t(*pixel_ssd_t)(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2);
-typedef dist_t(*pixel_ssd2_t)(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2, int width, int height);
-typedef void(*pixel_cmp_x3_t)(const pel_t *fenc, const pel_t *pix0, const pel_t *pix1, const pel_t *pix2,                    intptr_t i_stride, int scores[3]);
-typedef void(*pixel_cmp_x4_t)(const pel_t *fenc, const pel_t *pix0, const pel_t *pix1, const pel_t *pix2, const pel_t *pix3, intptr_t i_stride, int scores[4]);
-
-typedef void(*copy_pp_t)(pel_t* dst, intptr_t dstStride, const pel_t* src, intptr_t srcStride); // dst is aligned
-typedef void(*copy_sp_t)(pel_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride);
-typedef void(*copy_ps_t)(coeff_t* dst, intptr_t dstStride, const pel_t* src, intptr_t srcStride);
-typedef void(*copy_ss_t)(coeff_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride);
-
-typedef void(*pixel_sub_ps_t)(coeff_t* dst, intptr_t dstride, const pel_t* src0, const pel_t* src1, intptr_t sstride0, intptr_t sstride1);
-typedef void(*pixel_add_ps_t)(pel_t* a, intptr_t dstride, const pel_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1);
-typedef void(*pixel_avg_pp_t)(pel_t* dst, intptr_t dstride, const pel_t* src0, intptr_t sstride0, const pel_t* src1, intptr_t sstride1, int weight);
-
-typedef int(*mad_funcs_t)(pel_t *p_src, int i_src, int cu_size);
+typedef cmp_dist_t(*pixel8_cmp_t)(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2); /* two-buffer block comparison; every type below has a pel8_t and a pel10_t form */
+typedef cmp_dist_t(*pixel10_cmp_t)(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2);
+typedef dist_t(*pixel8_ssd_t)(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2);
+typedef dist_t(*pixel10_ssd_t)(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2);
+typedef dist_t(*pixel8_ssd2_t)(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2, int width, int height);
+typedef dist_t(*pixel10_ssd2_t)(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2, int width, int height);
+typedef void(*pixel8_cmp_x3_t)(const pel8_t *fenc, const pel8_t *pix0, const pel8_t *pix1, const pel8_t *pix2,                    intptr_t i_stride, int scores[3]);
+typedef void(*pixel10_cmp_x3_t)(const pel10_t *fenc, const pel10_t *pix0, const pel10_t *pix1, const pel10_t *pix2,                    intptr_t i_stride, int scores[3]);
+typedef void(*pixel8_cmp_x4_t)(const pel8_t *fenc, const pel8_t *pix0, const pel8_t *pix1, const pel8_t *pix2, const pel8_t *pix3, intptr_t i_stride, int scores[4]);
+typedef void(*pixel10_cmp_x4_t)(const pel10_t *fenc, const pel10_t *pix0, const pel10_t *pix1, const pel10_t *pix2, const pel10_t *pix3, intptr_t i_stride, int scores[4]);
+/* Block copies: in copy_XY the first letter names the source type and the second the destination type (p = pel samples, s = coeff_t). */
+typedef void(*copy_pp8_t)(pel8_t* dst, intptr_t dstStride, const pel8_t* src, intptr_t srcStride); // dst is aligned
+typedef void(*copy_pp10_t)(pel10_t* dst, intptr_t dstStride, const pel10_t* src, intptr_t srcStride); // dst is aligned
+typedef void(*copy_sp8_t)(pel8_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride);
+typedef void(*copy_sp10_t)(pel10_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride);
+typedef void(*copy_ps8_t)(coeff_t* dst, intptr_t dstStride, const pel8_t* src, intptr_t srcStride);
+typedef void(*copy_ps10_t)(coeff_t* dst, intptr_t dstStride, const pel10_t* src, intptr_t srcStride);
+typedef void(*copy_ss8_t)(coeff_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride); /* same signature as copy_ss10_t: no pel type is involved */
+typedef void(*copy_ss10_t)(coeff_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride);
+/* Residual helpers: sub_ps writes coeff_t from two pel inputs; add_ps writes pels from a pel and a coeff_t input and additionally takes an xavs2_t handle. */
+typedef void(*pixel_sub_ps8_t)(coeff_t* dst, intptr_t dstride, const pel8_t* src0, const pel8_t* src1, intptr_t sstride0, intptr_t sstride1);
+typedef void(*pixel_sub_ps10_t)(coeff_t* dst, intptr_t dstride, const pel10_t* src0, const pel10_t* src1, intptr_t sstride0, intptr_t sstride1);
+typedef void(*pixel_add_ps8_t)(xavs2_t *h, pel8_t* a, intptr_t dstride, const pel8_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1);
+typedef void(*pixel_add_ps10_t)(xavs2_t *h, pel10_t* a, intptr_t dstride, const pel10_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1);
+typedef void(*pixel_avg_pp8_t)(pel8_t* dst, intptr_t dstride, const pel8_t* src0, intptr_t sstride0, const pel8_t* src1, intptr_t sstride1, int weight);
+typedef void(*pixel_avg_pp10_t)(pel10_t* dst, intptr_t dstride, const pel10_t* src0, intptr_t sstride0, const pel10_t* src1, intptr_t sstride1, int weight);
+/* MAD kernels: take a sample pointer, its stride and the CU size, and return an int. */
+typedef int(*mad_funcs8_t)(pel8_t *p_src, int i_src, int cu_size);
+typedef int(*mad_funcs10_t)(pel10_t *p_src, int i_src, int cu_size);
 
 typedef struct {
 
-    pixel_cmp_t     sad    [NUM_PU_SIZES];
-    pixel_cmp_t     satd   [NUM_PU_SIZES];
-    pixel_cmp_t     sa8d   [NUM_PU_SIZES];
-    pixel_ssd_t     ssd    [NUM_PU_SIZES];
-    pixel_cmp_x3_t  sad_x3 [NUM_PU_SIZES];
-    pixel_cmp_x4_t  sad_x4 [NUM_PU_SIZES];
-
-    pixel_sub_ps_t  sub_ps [NUM_PU_SIZES];
-    pixel_add_ps_t  add_ps [NUM_PU_SIZES];
-    copy_sp_t       copy_sp[NUM_PU_SIZES];
-    copy_ps_t       copy_ps[NUM_PU_SIZES];
-    copy_ss_t       copy_ss[NUM_PU_SIZES];
-    copy_pp_t       copy_pp[NUM_PU_SIZES];
-    pixel_avg_pp_t  avg    [NUM_PU_SIZES];
-
-    pixel_cmp_t    *intra_cmp;  /* either satd or sad for intra mode prediction */
-    pixel_cmp_t    *fpel_cmp;   /* either satd or sad for fractional pixel comparison in ME */
-
-    mad_funcs_t     madf[CTU_DEPTH];
-
-    pixel_ssd2_t    ssd_block;
+    pixel8_cmp_t     sad8    [NUM_PU_SIZES];    /* comparison tables: one pel8_t and one pel10_t table per metric, NUM_PU_SIZES slots each */
+    pixel10_cmp_t     sad10   [NUM_PU_SIZES];
+    pixel8_cmp_t     satd8   [NUM_PU_SIZES];
+    pixel10_cmp_t     satd10   [NUM_PU_SIZES];
+    pixel8_cmp_t     sa8d8   [NUM_PU_SIZES];
+    pixel10_cmp_t     sa8d10   [NUM_PU_SIZES];
+    pixel8_ssd_t     ssd8    [NUM_PU_SIZES];
+    pixel10_ssd_t     ssd10    [NUM_PU_SIZES];
+    pixel8_cmp_x3_t  sad8_x3 [NUM_PU_SIZES];
+    pixel10_cmp_x3_t  sad10_x3 [NUM_PU_SIZES];
+    pixel8_cmp_x4_t  sad8_x4 [NUM_PU_SIZES];
+    pixel10_cmp_x4_t  sad10_x4 [NUM_PU_SIZES];
+    /* residual add/sub, block copy and averaging tables, again in pel8_t / pel10_t pairs */
+    pixel_sub_ps8_t  sub_ps8 [NUM_PU_SIZES];
+    pixel_sub_ps10_t  sub_ps10 [NUM_PU_SIZES];
+    pixel_add_ps8_t  add_ps8 [NUM_PU_SIZES];
+    pixel_add_ps10_t  add_ps10 [NUM_PU_SIZES];
+    copy_sp8_t       copy_sp8[NUM_PU_SIZES];
+    copy_sp10_t       copy_sp10[NUM_PU_SIZES];
+    copy_ps8_t       copy_ps8[NUM_PU_SIZES];
+    copy_ps10_t       copy_ps10[NUM_PU_SIZES];
+    copy_ss8_t       copy_ss8[NUM_PU_SIZES];
+    copy_ss10_t       copy_ss10[NUM_PU_SIZES];
+    copy_pp8_t       copy_pp8[NUM_PU_SIZES];
+    copy_pp10_t       copy_pp10[NUM_PU_SIZES];
+    pixel_avg_pp8_t  avg8    [NUM_PU_SIZES];
+    pixel_avg_pp10_t  avg10    [NUM_PU_SIZES];
+    /* pointers to a comparison-kernel table (satd or sad, per the member comments below) */
+    pixel8_cmp_t    *intra8_cmp;  /* either satd or sad for intra mode prediction */
+    pixel10_cmp_t    *intra10_cmp;  /* either satd or sad for intra mode prediction */
+    pixel8_cmp_t    *fpel8_cmp;   /* either satd or sad for fractional pixel comparison in ME */
+    pixel10_cmp_t    *fpel10_cmp;   /* either satd or sad for fractional pixel comparison in ME */
+    /* MAD kernels, CTU_DEPTH slots per sample type */
+    mad_funcs8_t     madf8[CTU_DEPTH];
+    mad_funcs10_t     madf10[CTU_DEPTH];
+    /* NOTE(review): pixel.c in this same patch still writes unsuffixed members (pixf->average, pixf->satd[...], pixf->copy_ss[...]) that are not declared here -- confirm those paths build */
+    pixel8_ssd2_t    ssd_block8;
+    pixel10_ssd2_t    ssd_block10;
     /* block average */
-    void (*average)(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height);
+    void (*average8)(pel8_t *dst, int i_dst, pel8_t *src1, int i_src1, pel8_t *src2, int i_src2, int width, int height);
+    void (*average10)(pel10_t *dst, int i_dst, pel10_t *src1, int i_src1, pel10_t *src2, int i_src2, int width, int height);
 } pixel_funcs_t;
 
 
@@ -185,17 +216,26 @@ extern const uint8_t g_partition_map_tab[];
  */
 
 #define xavs2_pixel_init FPFX(pixel_init)
-void xavs2_pixel_init(uint32_t cpu, pixel_funcs_t* pixf);
+void xavs2_pixel_init(xavs2_param_t* param, uint32_t cpu, pixel_funcs_t* pixf);
 
-#define xavs2_pixel_ssd_wxh FPFX(xpixel_ssd_wxh)
-uint64_t xavs2_pixel_ssd_wxh(pixel_funcs_t *pf,
-                             pel_t *p_pix1, intptr_t i_pix1,
-                             pel_t *p_pix2, intptr_t i_pix2,
+#define xavs2_pixel_ssd8_wxh FPFX(xpixel_ssd8_wxh)
+uint64_t xavs2_pixel_ssd8_wxh(pixel_funcs_t *pf,
+                             pel8_t *p_pix1, intptr_t i_pix1,
+                             pel8_t *p_pix2, intptr_t i_pix2,
                              int i_width, int i_height,
                              int inout_shift);
+#define xavs2_pixel_ssd10_wxh FPFX(xpixel_ssd10_wxh)
+uint64_t xavs2_pixel_ssd10_wxh(pixel_funcs_t *pf,
+                             pel10_t *p_pix1, intptr_t i_pix1,
+                             pel10_t *p_pix2, intptr_t i_pix2,
+                             int i_width, int i_height,
+                             int inout_shift);
+
 
+#define xavs2_mad8_init FPFX(mad8_init)
+void xavs2_mad8_init(uint32_t cpu, mad_funcs8_t *madf8);
 
-#define xavs2_mad_init FPFX(mad_init)
-void xavs2_mad_init(uint32_t cpu, mad_funcs_t *madf);
+#define xavs2_mad10_init FPFX(mad10_init)
+void xavs2_mad10_init(uint32_t cpu, mad_funcs10_t *madf10);
 
 #endif  // XAVS2_PIXEL_H
diff --git a/source/common/primitives.c b/source/common/primitives.c
index 07e4251..2656104 100644
--- a/source/common/primitives.c
+++ b/source/common/primitives.c
@@ -55,26 +55,29 @@ void xavs2_init_all_primitives(xavs2_param_t* param, intrinsic_func_t *p_funcs)
     uint32_t cpuid = p_funcs->cpuid;
 
     if (param != NULL) {
-        if (param->sample_bit_depth != g_bit_depth) {
-            xavs2_log(NULL, XAVS2_LOG_ERROR, "init primitives error: only %d bit-depth is supported\n", g_bit_depth);
+        if (param->sample_bit_depth != param->input_sample_bit_depth) {
+            xavs2_log(NULL, XAVS2_LOG_ERROR, "init primitives error: only %d bit-depth is supported\n", param->input_sample_bit_depth);
         }
     }
 
     /* init memory operation function handlers */
-    xavs2_mem_oper_init  (cpuid, p_funcs);
+    xavs2_mem_oper_init  (param, cpuid, p_funcs);
 
     /* init function handles */
-    xavs2_intra_pred_init(cpuid, p_funcs);
-    xavs2_mc_init        (cpuid, p_funcs);
-    xavs2_pixel_init     (cpuid, &p_funcs->pixf);
-    xavs2_deblock_init   (cpuid, p_funcs);
+    xavs2_intra_pred_init(param, cpuid, p_funcs);
+    xavs2_mc_init        (param, cpuid, p_funcs);
+    xavs2_pixel_init     (param, cpuid, &p_funcs->pixf);
+    xavs2_deblock_init   (param, cpuid, p_funcs);
     xavs2_dct_init       (cpuid, &p_funcs->dctf);
     xavs2_quant_init     (cpuid, &p_funcs->dctf);
     xavs2_cg_scan_init   (cpuid, p_funcs);
-    xavs2_mad_init       (cpuid, p_funcs->pixf.madf);
-
-    xavs2_sao_init       (cpuid, p_funcs);
-    xavs2_alf_init       (cpuid, p_funcs);
+    if (param->input_sample_bit_depth == 8) {
+    xavs2_mad8_init       (cpuid, p_funcs->pixf.madf8);
+    } else {
+    xavs2_mad10_init       (cpuid, p_funcs->pixf.madf10);
+    }
+    xavs2_sao_init       (param, cpuid, p_funcs);
+    xavs2_alf_init       (param, cpuid, p_funcs);
 
     xavs2_rdo_init       (cpuid, p_funcs);
 }
diff --git a/source/common/primitives.h b/source/common/primitives.h
index 7c690fe..0847cc1 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -57,39 +57,57 @@ typedef void *(*memcpy_t)(void *dst, const void *src, size_t n);
 /* ---------------------------------------------------------------------------
  * inter prediction
  */
-typedef void(*block_copy_t   )(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h);
-typedef void(*plane_copy_di_t)(pel_t *dstu, intptr_t i_dstu, pel_t *dstv, intptr_t i_dstv, pel_t *src, intptr_t i_src, int w, int h);
-typedef void(*intpl_t        )(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
-typedef void(*intpl_ext_t    )(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_x, const int8_t *coeff_y);
-
-typedef void(*intpl_luma_hor_t)(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
-typedef void(*intpl_luma_ext_t)(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff);
-typedef void(*intpl_luma_ver_t)(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff);
-
-typedef void(*intpl_luma_ver_x3_t)(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff);
-typedef void(*intpl_luma_hor_x3_t)(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff);
-typedef void(*intpl_luma_ext_x3_t)(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff);
-
-typedef void (*filter_pp_t)    (const pel_t   *src, intptr_t srcStride, pel_t   *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_hps_t)   (const pel_t   *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-typedef void (*filter_ps_t)    (const pel_t   *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_sp_t)    (const int16_t *src, intptr_t srcStride, pel_t   *dst, intptr_t dstStride, int coeffIdx);
+typedef void(*block_copy8_t   )(xavs2_t *bb, pel8_t *dst, intptr_t i_dst, pel8_t *src, intptr_t i_src, int w, int h);
+typedef void(*block_copy10_t   )(xavs2_t *bb, pel10_t *dst, intptr_t i_dst, pel10_t *src, intptr_t i_src, int w, int h);
+typedef void(*plane_copy8_di_t)(xavs2_t *bb, pel8_t *dstu, intptr_t i_dstu, pel8_t *dstv, intptr_t i_dstv, pel8_t *src, intptr_t i_src, int w, int h);
+typedef void(*plane_copy10_di_t)(xavs2_t *bb, pel10_t *dstu, intptr_t i_dstu, pel10_t *dstv, intptr_t i_dstv, pel10_t *src, intptr_t i_src, int w, int h);
+typedef void(*intpl8_t        )(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, const int8_t *coeff);
+typedef void(*intpl10_t        )(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, const int8_t *coeff);
+typedef void(*intpl8_ext_t    )(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, const int8_t *coeff_x, const int8_t *coeff_y);
+typedef void(*intpl10_ext_t    )(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, const int8_t *coeff_x, const int8_t *coeff_y);
+
+typedef void(*intpl_luma8_hor_t)(xavs2_t *h, pel8_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel8_t *src, int i_src, int width, int height, const int8_t *coeff);
+typedef void(*intpl_luma10_hor_t)(xavs2_t *h, pel10_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel10_t *src, int i_src, int width, int height, const int8_t *coeff);
+typedef void(*intpl_luma8_ext_t)(xavs2_t *h, pel8_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff);
+typedef void(*intpl_luma10_ext_t)(xavs2_t *h, pel10_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff);
+typedef void(*intpl_luma8_ver_t)(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const *coeff);
+typedef void(*intpl_luma10_ver_t)(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const *coeff);
+
+typedef void(*intpl_luma8_ver_x3_t)(xavs2_t *h, pel8_t *const dst[3], int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const **coeff);
+typedef void(*intpl_luma10_ver_x3_t)(xavs2_t *h, pel10_t *const dst[3], int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const **coeff);
+typedef void(*intpl_luma8_hor_x3_t)(xavs2_t *h, pel8_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel8_t *src, int i_src, int width, int height, const int8_t **coeff);
+typedef void(*intpl_luma10_hor_x3_t)(xavs2_t *h, pel10_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel10_t *src, int i_src, int width, int height, const int8_t **coeff);
+typedef void(*intpl_luma8_ext_x3_t)(xavs2_t *h, pel8_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff);
+typedef void(*intpl_luma10_ext_x3_t)(xavs2_t *h, pel10_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff);
+
+typedef void (*filter_pp8_t)    (const pel8_t   *src, intptr_t srcStride, pel8_t   *dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_pp10_t)    (const pel10_t   *src, intptr_t srcStride, pel10_t   *dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hps8_t)   (const pel8_t   *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+typedef void (*filter_hps10_t)   (const pel10_t   *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+typedef void (*filter_ps8_t)    (const pel8_t   *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_ps10_t)    (const pel10_t   *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_sp8_t)    (const int16_t *src, intptr_t srcStride, pel8_t   *dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_sp10_t)    (const int16_t *src, intptr_t srcStride, pel10_t   *dst, intptr_t dstStride, int coeffIdx);
 typedef void (*filter_ss_t)    (const int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx);
-typedef void (*filter_hv_pp_t) (const pel_t   *src, intptr_t srcStride, pel_t   *dst, intptr_t dstStride, int idxX, int idxY);
-typedef void (*filter_p2s_t)   (const pel_t   *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride);
+typedef void (*filter_hv_pp8_t) (const pel8_t   *src, intptr_t srcStride, pel8_t   *dst, intptr_t dstStride, int idxX, int idxY);
+typedef void (*filter_hv_pp10_t) (const pel10_t   *src, intptr_t srcStride, pel10_t   *dst, intptr_t dstStride, int idxX, int idxY);
+typedef void (*filter_p2s8_t)   (const pel8_t   *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride);
+typedef void (*filter_p2s10_t)   (const pel10_t   *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride);
 
 /* ---------------------------------------------------------------------------
  * intra prediction
  */
-typedef void(*intra_pred_t)(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
-typedef void(*fill_edge_t) (const pel_t *p_topleft, int i_topleft, const pel_t *p_lcu_ep, pel_t *ep, uint32_t i_avail, int bsx, int bsy);
+typedef void(*intra8_pred_t)(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+typedef void(*intra10_pred_t)(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+typedef void(*fill_edge8_t) (xavs2_t *h, const pel8_t *p_topleft, int i_topleft, const pel8_t *p_lcu_ep, pel8_t *ep, uint32_t i_avail, int bsx, int bsy);
+typedef void(*fill_edge10_t) (xavs2_t *h, const pel10_t *p_topleft, int i_topleft, const pel10_t *p_lcu_ep, pel10_t *ep, uint32_t i_avail, int bsx, int bsy);
 typedef void(*fill_ref_samples_t)(xavs2_t *h, cu_t *p_cu, int img_x, int img_y, int block_x, int block_y, int bsx, int bsy);
 
 
 /* ---------------------------------------------------------------------------
  * transform and quantization functions
  */
-typedef void(*dct_t)(const coeff_t *src, coeff_t *dst, int i_src);
+typedef void(*dct_t)(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 
 /* ---------------------------------------------------------------------------
  * coefficient scan
@@ -102,11 +120,11 @@ typedef struct {
     /* dct */
     dct_t         dct[NUM_PU_SIZES];
     dct_t        idct[NUM_PU_SIZES];
-    dct_t        dct_half[NUM_PU_SIZES];   // Ö»Çó½âDCT¾ØÕóµÄµÍÆµÏµÊý
+    dct_t        dct_half[NUM_PU_SIZES];   // only solves for the low-frequency coefficients of the DCT matrix
 
     /* 2nd transform */
-    void(*transform_4x4_2nd)    (coeff_t *coeff, int i_coeff);
-    void(*inv_transform_4x4_2nd)(coeff_t *coeff, int i_coeff);
+    void(*transform_4x4_2nd)    (xavs2_t *h, coeff_t *coeff, int i_coeff);
+    void(*inv_transform_4x4_2nd)(xavs2_t *h, coeff_t *coeff, int i_coeff);
     void(*transform_2nd)        (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left);
     void(*inv_transform_2nd)    (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left);
 
@@ -120,11 +138,12 @@ typedef struct {
 
 
 /* SAO filter function */
-typedef void(*sao_flt_t)(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
+typedef void(*sao_flt8_t)(xavs2_t* h,pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src,
+                         int i_block_w, int i_block_h,
+                         int *lcu_avail, SAOBlkParam *sao_param);
+typedef void(*sao_flt10_t)(xavs2_t* h,pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src,
                          int i_block_w, int i_block_h,
                          int *lcu_avail, SAOBlkParam *sao_param);
-
-
 
 /* ---------------------------------------------------------------------------
  */
@@ -137,7 +156,8 @@ typedef struct intrinsic_func_t {
     void*(*fast_memset)(void *dst, int val, size_t n);
     void (*mem_repeat_i)(void *dst, int val, size_t count);
     void*(*mem_repeat_p)(void *dst, int val, size_t count);
-    void (*lowres_filter)(pel_t *src, int i_src, pel_t *dst, int i_dst, int width, int height);
+    void (*lowres_filter8)(xavs2_t *h, pel8_t *src, int i_src, pel8_t *dst, int i_dst, int width, int height);
+    void (*lowres_filter10)(xavs2_t *h, pel10_t *src, int i_src, pel10_t *dst, int i_dst, int width, int height);
 
     pixel_funcs_t       pixf;
 
@@ -145,46 +165,73 @@ typedef struct intrinsic_func_t {
      * block copy
      */
     /* align copy */
-    block_copy_t        align_copy;
+    block_copy8_t        align_copy8;
+    block_copy10_t        align_copy10;
 
     /* plane copy */
-    block_copy_t        plane_copy;
-    plane_copy_di_t     plane_copy_deinterleave;
+    block_copy8_t        plane_copy8;
+    block_copy10_t        plane_copy10;
+    plane_copy8_di_t     plane_copy8_deinterleave;
+    plane_copy10_di_t     plane_copy10_deinterleave;
 
     /* ---------------------------------------------------------------------------
      * Motion Compensation
      */
-    intpl_luma_hor_t    intpl_luma_hor;
-    intpl_luma_ver_t    intpl_luma_ver;
-    intpl_luma_ext_t    intpl_luma_ext;
-
-    intpl_luma_ver_x3_t intpl_luma_ver_x3;
-    intpl_luma_hor_x3_t intpl_luma_hor_x3;
-    intpl_luma_ext_x3_t intpl_luma_ext_x3;
-
-    intpl_t             intpl_luma_block_hor;
-    intpl_t             intpl_luma_block_ver;
-    intpl_ext_t         intpl_luma_block_ext;
-
-    intpl_t             intpl_chroma_block_hor;
-    intpl_t             intpl_chroma_block_ver;
-    intpl_ext_t         intpl_chroma_block_ext;
-
-    struct inter_pred_t {
-        filter_pp_t     luma_hpp;    // 8-tap luma motion compensation interpolation filters
-        filter_hps_t    luma_hps;
-        filter_pp_t     luma_vpp;
-        filter_ps_t     luma_vps;
-        filter_sp_t     luma_vsp;
-        filter_ss_t     luma_vss;
-        filter_hv_pp_t  luma_hvpp;   // combines hps + vsp
-    } intpl[NUM_PU_SIZES];
+    intpl_luma8_hor_t    intpl_luma8_hor;
+    intpl_luma10_hor_t    intpl_luma10_hor;
+    intpl_luma8_ver_t    intpl_luma8_ver;
+    intpl_luma10_ver_t    intpl_luma10_ver;
+    intpl_luma8_ext_t    intpl_luma8_ext;
+    intpl_luma10_ext_t    intpl_luma10_ext;
+
+    intpl_luma8_ver_x3_t intpl_luma8_ver_x3;
+    intpl_luma10_ver_x3_t intpl_luma10_ver_x3;
+    intpl_luma8_hor_x3_t intpl_luma8_hor_x3;
+    intpl_luma10_hor_x3_t intpl_luma10_hor_x3;
+    intpl_luma8_ext_x3_t intpl_luma8_ext_x3;
+    intpl_luma10_ext_x3_t intpl_luma10_ext_x3;
+
+    intpl8_t             intpl_luma8_block_hor;
+    intpl10_t             intpl_luma10_block_hor;
+    intpl8_t             intpl_luma8_block_ver;
+    intpl10_t             intpl_luma10_block_ver;
+    intpl8_ext_t         intpl_luma8_block_ext;
+    intpl10_ext_t         intpl_luma10_block_ext;
+
+    intpl8_t             intpl_chroma8_block_hor;
+    intpl10_t             intpl_chroma10_block_hor;
+    intpl8_t             intpl_chroma8_block_ver;
+    intpl10_t             intpl_chroma10_block_ver;
+    intpl8_ext_t         intpl_chroma8_block_ext;
+    intpl10_ext_t         intpl_chroma10_block_ext;
+
+    struct inter_pred8_t {
+        filter_pp8_t     luma_hpp8;    // 8-tap luma motion compensation interpolation filters
+        filter_hps8_t    luma_hps8;
+        filter_pp8_t     luma_vpp8;
+        filter_ps8_t     luma_vps8;
+        filter_sp8_t     luma_vsp8;
+        filter_ss_t     luma_vss8;
+        filter_hv_pp8_t  luma_hvpp8;   // combines hps + vsp
+    } intpl8[NUM_PU_SIZES];
+
+    struct inter_pred10_t {
+        filter_pp10_t     luma_hpp10;    // 8-tap luma motion compensation interpolation filters
+        filter_hps10_t    luma_hps10;
+        filter_pp10_t     luma_vpp10;
+        filter_ps10_t     luma_vps10;
+        filter_sp10_t     luma_vsp10;
+        filter_ss_t     luma_vss10;
+        filter_hv_pp10_t  luma_hvpp10;   // combines hps + vsp
+    } intpl10[NUM_PU_SIZES];
 
     /* ---------------------------------------------------------------------------
      * intra prediction
      */
-    intra_pred_t        intraf[NUM_INTRA_MODE];
-    fill_edge_t         fill_edge_f[4];   /* 0, x, y, xy */
+    intra8_pred_t        intraf8[NUM_INTRA_MODE];
+    intra10_pred_t        intraf10[NUM_INTRA_MODE];
+    fill_edge8_t         fill_edge8_f[4];   /* 0, x, y, xy */
+    fill_edge10_t         fill_edge10_f[4];   /* 0, x, y, xy */
     fill_ref_samples_t  fill_ref_luma[2]; /* 0: CU inside picture; 1: on right/bottom */
 
     /* ---------------------------------------------------------------------------
@@ -199,16 +246,25 @@ typedef struct intrinsic_func_t {
     /* ---------------------------------------------------------------------------
      * In-loop filter
      */
-    void(*deblock_luma[2])(pel_t *, int, int, int, uint8_t*);
-    void(*deblock_chroma[2])(pel_t *, pel_t *, int, int, int, uint8_t*);
+    void(*deblock_luma8[2])(xavs2_t *, pel8_t *, int, int, int, uint8_t*);
+    void(*deblock_chroma8[2])(xavs2_t *, pel8_t *, pel8_t *, int, int, int, uint8_t*);
+    void(*deblock_luma10[2])(xavs2_t *, pel10_t *, int, int, int, uint8_t*);
+    void(*deblock_chroma10[2])(xavs2_t *, pel10_t *, pel10_t *, int, int, int, uint8_t*);
 
-    void(*deblock_luma_double[2])  (pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag);
-    void(*deblock_chroma_double[2])(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag);
+    void(*deblock_luma8_double[2])  (pel8_t *src, int stride, int alpha, int beta, uint8_t *flt_flag);
+    void(*deblock_chroma8_double[2])(pel8_t *src_u, pel8_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag);
+    void(*deblock_luma10_double[2])  (pel10_t *src, int stride, int alpha, int beta, uint8_t *flt_flag);
+    void(*deblock_chroma10_double[2])(pel10_t *src_u, pel10_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag);
 
-    sao_flt_t       sao_block;          /* filter for SAO */
+    sao_flt8_t       sao_block8;          /* filter for SAO */
+    sao_flt10_t       sao_block10;          /* filter for SAO */
 
     /* function handles */
-    void(*alf_flt[2])(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
+    void(*alf_flt8[2])(xavs2_t *h, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src,
+                      int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
+                      int *alf_coeff, int b_top_avail, int b_down_avail);
+
+    void(*alf_flt10[2])(xavs2_t *h, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src,
                       int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
                       int *alf_coeff, int b_top_avail, int b_down_avail);
 
@@ -228,16 +284,17 @@ extern intrinsic_func_t g_funcs;
  * ===========================================================================
  */
 #define xavs2_mem_oper_init FPFX(mem_oper_init)
-void xavs2_mem_oper_init    (uint32_t cpuid, intrinsic_func_t *pf);
+void xavs2_mem_oper_init    (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf);
 
 #define xavs2_mc_init FPFX(mc_init)
-void xavs2_mc_init          (uint32_t cpuid, intrinsic_func_t *pf);
+void xavs2_mc_init          (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf);
 
 #define xavs2_intra_pred_init FPFX(intra_pred_init)
-void xavs2_intra_pred_init  (uint32_t cpuid, intrinsic_func_t *pf);
+void xavs2_intra_pred_init  (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf);
 
 #define xavs2_dct_init FPFX(dct_init)
 void xavs2_dct_init         (uint32_t cpuid, dct_funcs_t *dctf);
+
 #define xavs2_quant_init FPFX(quant_init)
 void xavs2_quant_init       (uint32_t cpuid, dct_funcs_t *quantf);
 
@@ -245,12 +302,13 @@ void xavs2_quant_init       (uint32_t cpuid, dct_funcs_t *quantf);
 void xavs2_cg_scan_init     (uint32_t cpuid, intrinsic_func_t *pf);
 
 #define xavs2_deblock_init FPFX(deblock_init)
-void xavs2_deblock_init     (uint32_t cpuid, intrinsic_func_t* lf);
+void xavs2_deblock_init     (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t* lf);
 
 #define xavs2_sao_init FPFX(sao_init)
-void xavs2_sao_init         (uint32_t cpuid, intrinsic_func_t *pf);
+void xavs2_sao_init         (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf);
+
 #define xavs2_alf_init FPFX(alf_init)
-void xavs2_alf_init         (uint32_t cpuid, intrinsic_func_t *pf);
+void xavs2_alf_init         (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf);
 
 #define xavs2_rdo_init FPFX(rdo_init)
 void xavs2_rdo_init         (uint32_t cpuid, intrinsic_func_t *pf);
diff --git a/source/common/quant.c b/source/common/quant.c
index f88f004..5eb3ae5 100644
--- a/source/common/quant.c
+++ b/source/common/quant.c
@@ -228,6 +228,7 @@ void xavs2_quant_init(uint32_t cpuid, dct_funcs_t *dctf)
         dctf->add_sign  = add_sign_sse128;
     }
 
+#if defined(__AVX2__)
     if (cpuid & XAVS2_CPU_AVX2) {
         dctf->quant     = quant_c_avx2;
         dctf->dequant   = dequant_c_avx2;
@@ -241,6 +242,7 @@ void xavs2_quant_init(uint32_t cpuid, dct_funcs_t *dctf)
         dctf->dequant   = FPFX(dequant_avx2);
 #endif
     }
+#endif
 #else
     UNUSED_PARAMETER(cpuid);
 #endif  // if HAVE_MMX
diff --git a/source/common/threadpool.c b/source/common/threadpool.c
index 34bd8f7..9fe742d 100644
--- a/source/common/threadpool.c
+++ b/source/common/threadpool.c
@@ -300,7 +300,7 @@ int xavs2_threadpool_init(xavs2_threadpool_t **p_pool, int threads, xavs2_tfunc_
     if (xavs2_sync_job_list_init(&pool->uninit, pool->i_threads) ||
         xavs2_sync_job_list_init(&pool->run,    pool->i_threads) ||
         xavs2_sync_job_list_init(&pool->done,   pool->i_threads)) {
-        goto fail;
+        goto fail8;
     }
 
     for (i = 0; i < pool->i_threads; i++) {
@@ -313,13 +313,13 @@ int xavs2_threadpool_init(xavs2_threadpool_t **p_pool, int threads, xavs2_tfunc_
 
     for (i = 0; i < pool->i_threads; i++) {
         if (xavs2_create_thread(pool->thread_handle + i, (xavs2_tfunc_t)proc_xavs2_threadpool_thread, pool)) {
-            goto fail;
+            goto fail8;
         }
     }
 
     return 0;
 
-fail:
+fail8:
     return -1;
 }
 
diff --git a/source/common/transform.c b/source/common/transform.c
index 250545d..f282115 100644
--- a/source/common/transform.c
+++ b/source/common/transform.c
@@ -1061,9 +1061,9 @@ static void xTr2nd_4_1d_Inv_Hor(coeff_t *coeff, int i_coeff, int i_shift, int cl
 
 /* ---------------------------------------------------------------------------
  */
-static void transform_4x4_2nd_c(coeff_t *coeff, int i_coeff)
+static void transform_4x4_2nd_c(xavs2_t *h, coeff_t *coeff, int i_coeff)
 {
-    const int shift1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + 1;
+    const int shift1 = B4X4_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT + 1;
     const int shift2 = B4X4_IN_BIT + FACTO_BIT + 1;
 
     xTr2nd_4_1d_Hor(coeff, i_coeff, shift1, g_2T_C);
@@ -1072,11 +1072,11 @@ static void transform_4x4_2nd_c(coeff_t *coeff, int i_coeff)
 
 /* ---------------------------------------------------------------------------
  */
-static void inv_transform_4x4_2nd_c(coeff_t *coeff, int i_coeff)
+static void inv_transform_4x4_2nd_c(xavs2_t *h, coeff_t *coeff, int i_coeff)
 {
     const int shift1 = 5;
-    const int shift2 = 20 - g_bit_depth + 2;
-    const int clip_depth2 = g_bit_depth + 1;
+    const int shift2 = 20 - h->param->input_sample_bit_depth + 2;
+    const int clip_depth2 = h->param->input_sample_bit_depth + 1;
 
     xTr2nd_4_1d_Inv_Ver(coeff, i_coeff, shift1, g_2T_C);
     xTr2nd_4_1d_Inv_Hor(coeff, i_coeff, shift2, clip_depth2, g_2T_C);
@@ -1120,12 +1120,12 @@ static void inv_transform_2nd_c(coeff_t *coeff, int i_coeff, int i_mode, int b_t
 
 /* ---------------------------------------------------------------------------
  */
-static void dct_4x4_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_4x4_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
 #define BSIZE   4
     ALIGN32(coeff_t coeff[BSIZE * BSIZE]);
     ALIGN32(coeff_t block[BSIZE * BSIZE]);
-    int shift1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT;   // 0
+    int shift1 = B4X4_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT;   // 0
     int shift2 = B4X4_IN_BIT + FACTO_BIT;                               // 7
     int i;
 
@@ -1142,15 +1142,15 @@ static void dct_4x4_c(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-static void idct_4x4_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_4x4_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
 #define BSIZE   4
     ALIGN32(coeff_t coeff[BSIZE * BSIZE]);
     ALIGN32(coeff_t block[BSIZE * BSIZE]);
     int shift1 = 5;
-    int shift2 = 20 - g_bit_depth;
+    int shift2 = 20 - h->param->input_sample_bit_depth;
     int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1;
+    int clip_depth2 = h->param->input_sample_bit_depth + 1;
     int i;
 
     partialButterflyInverse4(src,   coeff, shift1, BSIZE, clip_depth1);
@@ -1164,12 +1164,12 @@ static void idct_4x4_c(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-static void dct_8x8_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_8x8_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
 #define BSIZE   8
     ALIGN32(coeff_t coeff[BSIZE * BSIZE]);
     ALIGN32(coeff_t block[BSIZE * BSIZE]);
-    int shift1 = B8X8_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT;
+    int shift1 = B8X8_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT;
     int shift2 = B8X8_IN_BIT + FACTO_BIT;
     int i;
 
@@ -1184,15 +1184,15 @@ static void dct_8x8_c(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-static void idct_8x8_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_8x8_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
 #define BSIZE   8
     ALIGN32(coeff_t coeff[BSIZE * BSIZE]);
     ALIGN32(coeff_t block[BSIZE * BSIZE]);
     int shift1 = 5;
-    int shift2 = 20 - g_bit_depth;
+    int shift2 = 20 - h->param->input_sample_bit_depth;
     int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1;
+    int clip_depth2 = h->param->input_sample_bit_depth + 1;
     int i;
 
     partialButterflyInverse8(src,   coeff, shift1, BSIZE, clip_depth1);
@@ -1206,12 +1206,12 @@ static void idct_8x8_c(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-static void dct_16x16_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_16x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
 #define BSIZE   16
     ALIGN32(coeff_t coeff[BSIZE * BSIZE]);
     ALIGN32(coeff_t block[BSIZE * BSIZE]);
-    int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT;
+    int shift1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT;
     int shift2 = B16X16_IN_BIT + FACTO_BIT;
     int i;
 
@@ -1226,15 +1226,15 @@ static void dct_16x16_c(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-static void idct_16x16_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_16x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
 #define BSIZE   16
     ALIGN32(coeff_t coeff[BSIZE * BSIZE]);
     ALIGN32(coeff_t block[BSIZE * BSIZE]);
     int shift1 = 5;
-    int shift2 = 20 - g_bit_depth;
+    int shift2 = 20 - h->param->input_sample_bit_depth;
     int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1;
+    int clip_depth2 = h->param->input_sample_bit_depth + 1;
     int i;
 
     partialButterflyInverse16(src,   coeff, shift1, BSIZE, clip_depth1);
@@ -1250,12 +1250,12 @@ static void idct_16x16_c(const coeff_t *src, coeff_t *dst, int i_dst)
  * NOTE:
  * i_src - the stride of src (the lowest bit is additional wavelet flag)
  */
-static void dct_32x32_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_32x32_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
 #define BSIZE   32
     ALIGN32(coeff_t coeff[BSIZE * BSIZE]);
     ALIGN32(coeff_t block[BSIZE * BSIZE]);
-    int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01);
+    int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01);
     int shift2 = B32X32_IN_BIT + FACTO_BIT;
     int i;
 
@@ -1273,10 +1273,10 @@ static void dct_32x32_c(const coeff_t *src, coeff_t *dst, int i_src)
  * NOTE:
  * i_src - the stride of src (the lowest bit is additional wavelet flag)
  */
-static void dct_32x32_half_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_32x32_half_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
     int i;
-    dct_32x32_c(src, dst, i_src);
+    dct_32x32_c(h, src, dst, i_src);
 
     for (i = 0; i < 16; i++) {
         memset(dst + 16, 0, 16 * sizeof(coeff_t));
@@ -1289,16 +1289,16 @@ static void dct_32x32_half_c(const coeff_t *src, coeff_t *dst, int i_src)
  * NOTE:
  * i_dst - the stride of dst (the lowest bit is additional wavelet flag)
  */
-static void idct_32x32_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_32x32_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
 #define BSIZE   32
     ALIGN32(coeff_t coeff[BSIZE * BSIZE]);
     ALIGN32(coeff_t block[BSIZE * BSIZE]);
     int a_flag = i_dst & 0x01;
     int shift1 = 5;
-    int shift2 = 20 - g_bit_depth - a_flag;
+    int shift2 = 20 - h->param->input_sample_bit_depth - a_flag;
     int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1 + a_flag;
+    int clip_depth2 = h->param->input_sample_bit_depth + 1 + a_flag;
     int i;
 
     i_dst &= 0xFE;    /* remember to remove the flag bit */
@@ -1313,13 +1313,13 @@ static void idct_32x32_c(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-static void dct_16x4_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_16x4_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
 #define BSIZE_H   16
 #define BSIZE_V   4
     ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]);
     ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]);
-    int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT;
+    int shift1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT;
     int shift2 = B16X16_IN_BIT + FACTO_BIT - 2;
     int i;
 
@@ -1335,16 +1335,16 @@ static void dct_16x4_c(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-static void idct_16x4_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_16x4_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
 #define BSIZE_H   16
 #define BSIZE_V   4
     ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]);
     ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]);
     int shift1 = 5;
-    int shift2 = 20 - g_bit_depth;
+    int shift2 = 20 - h->param->input_sample_bit_depth;
     int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1;
+    int clip_depth2 = h->param->input_sample_bit_depth + 1;
     int i;
 
     partialButterflyInverse4 (src,   coeff, shift1, BSIZE_H, clip_depth1);
@@ -1359,13 +1359,13 @@ static void idct_16x4_c(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-static void dct_4x16_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_4x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
 #define BSIZE_H   4
 #define BSIZE_V   16
     ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]);
     ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]);
-    int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2;
+    int shift1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT - 2;
     int shift2 = B16X16_IN_BIT + FACTO_BIT;
     int i;
 
@@ -1381,16 +1381,16 @@ static void dct_4x16_c(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-static void idct_4x16_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_4x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
 #define BSIZE_H   4
 #define BSIZE_V   16
     ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]);
     ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]);
     int shift1 = 5;
-    int shift2 = 20 - g_bit_depth;
+    int shift2 = 20 - h->param->input_sample_bit_depth;
     int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1;
+    int clip_depth2 = h->param->input_sample_bit_depth + 1;
     int i;
 
     partialButterflyInverse16(src,   coeff, shift1, BSIZE_H, clip_depth1);
@@ -1407,13 +1407,13 @@ static void idct_4x16_c(const coeff_t *src, coeff_t *dst, int i_dst)
  * NOTE:
  * i_src - the stride of src (the lowest bit is additional wavelet flag)
  */
-static void dct_32x8_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_32x8_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
 #define BSIZE_H   32
 #define BSIZE_V   8
     ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]);
     ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]);
-    int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT;
+    int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT;
     int shift2 = B32X32_IN_BIT + FACTO_BIT - 2 - (i_src & 0x01);
     int i;
 
@@ -1432,16 +1432,16 @@ static void dct_32x8_c(const coeff_t *src, coeff_t *dst, int i_src)
  * NOTE:
  * i_dst - the stride of dst (the lowest bit is additional wavelet flag)
  */
-static void idct_32x8_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_32x8_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
 #define BSIZE_H   32
 #define BSIZE_V   8
     ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]);
     ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]);
     int shift1 = 5;
-    int shift2 = 20 - g_bit_depth - (i_dst & 0x01);
+    int shift2 = 20 - h->param->input_sample_bit_depth - (i_dst & 0x01);
     int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01);
+    int clip_depth2 = h->param->input_sample_bit_depth + 1 + (i_dst & 0x01);
     int i;
 
     partialButterflyInverse8 (src,   coeff, shift1, BSIZE_H, clip_depth1);
@@ -1459,13 +1459,13 @@ static void idct_32x8_c(const coeff_t *src, coeff_t *dst, int i_dst)
  * NOTE:
  * i_src - the stride of src (the lowest bit is additional wavelet flag)
  */
-static void dct_8x32_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_8x32_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
 #define BSIZE_H   8
 #define BSIZE_V   32
     ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]);
     ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]);
-    int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01);
+    int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01);
     int shift2 = B32X32_IN_BIT + FACTO_BIT;
     int i;
 
@@ -1484,16 +1484,16 @@ static void dct_8x32_c(const coeff_t *src, coeff_t *dst, int i_src)
  * NOTE:
  * i_dst - the stride of dst (the lowest bit is additional wavelet flag)
  */
-static void idct_8x32_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_8x32_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
 #define BSIZE_H   8
 #define BSIZE_V   32
     ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]);
     ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]);
     int shift1 = 5;
-    int shift2 = 20 - g_bit_depth - (i_dst & 0x01);
+    int shift2 = 20 - h->param->input_sample_bit_depth - (i_dst & 0x01);
     int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01);
+    int clip_depth2 = h->param->input_sample_bit_depth + 1 + (i_dst & 0x01);
     int i;
 
     partialButterflyInverse32(src,   coeff, shift1, BSIZE_H, clip_depth1);
@@ -1511,22 +1511,22 @@ static void idct_8x32_c(const coeff_t *src, coeff_t *dst, int i_dst)
  * NOTE:
  * i_src - the stride of src (the lowest bit is additional wavelet flag)
  */
-static void dct_64x64_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_64x64_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
     UNUSED_PARAMETER(i_src);
     wavelet_64x64_c(src, dst);
-    dct_32x32_c(dst, dst, 32 | 0x01);  /* 32x32 dct */
+    dct_32x32_c(h, dst, dst, 32 | 0x01);  /* 32x32 dct */
 }
 
 /* ---------------------------------------------------------------------------
  * NOTE:
  * i_src - the stride of src (the lowest bit is additional wavelet flag)
  */
-static void dct_64x64_half_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_64x64_half_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
     UNUSED_PARAMETER(i_src);
     wavelet_64x64_c(src, dst);
-    dct_32x32_half_c(dst, dst, 32 | 0x01);  /* 32x32 dct */
+    dct_32x32_half_c(h, dst, dst, 32 | 0x01);  /* 32x32 dct */
 }
 
 
@@ -1534,10 +1534,10 @@ static void dct_64x64_half_c(const coeff_t *src, coeff_t *dst, int i_src)
  * NOTE:
  * i_dst - the stride of dst (the lowest bit is additional wavelet flag)
  */
-static void idct_64x64_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_64x64_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
     UNUSED_PARAMETER(i_dst);
-    idct_32x32_c(src, dst, 32 | 0x01); /* 32x32 idct */
+    idct_32x32_c(h, src, dst, 32 | 0x01); /* 32x32 idct */
     inv_wavelet_64x64_c(dst);
 }
 
@@ -1545,21 +1545,21 @@ static void idct_64x64_c(const coeff_t *src, coeff_t *dst, int i_dst)
  * NOTE:
  * i_src - the stride of src (the lowest bit is additional wavelet flag)
  */
-static void dct_64x16_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_64x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
     UNUSED_PARAMETER(i_src);
     wavelet_64x16_c(src, dst);
-    dct_32x8_c(dst, dst, 32 | 0x01);
+    dct_32x8_c(h, dst, dst, 32 | 0x01);
 }
 
 /* ---------------------------------------------------------------------------
  * NOTE:
  * i_dst - the stride of dst (the lowest bit is additional wavelet flag)
  */
-static void idct_64x16_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_64x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
     UNUSED_PARAMETER(i_dst);
-    idct_32x8_c(src, dst, 32 | 0x01);
+    idct_32x8_c(h, src, dst, 32 | 0x01);
     inv_wavelet_64x16_c(dst);
 }
 
@@ -1568,21 +1568,21 @@ static void idct_64x16_c(const coeff_t *src, coeff_t *dst, int i_dst)
  * NOTE:
  * i_src - the stride of src (the lowest bit is additional wavelet flag)
  */
-static void dct_16x64_c(const coeff_t *src, coeff_t *dst, int i_src)
+static void dct_16x64_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src)
 {
     UNUSED_PARAMETER(i_src);
     wavelet_16x64_c(src, dst);
-    dct_8x32_c(dst, dst, 8 | 0x01);
+    dct_8x32_c(h, dst, dst, 8 | 0x01);
 }
 
 /* ---------------------------------------------------------------------------
  * NOTE:
  * i_dst - the stride of dst (the lowest bit is additional wavelet flag)
  */
-static void idct_16x64_c(const coeff_t *src, coeff_t *dst, int i_dst)
+static void idct_16x64_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
     UNUSED_PARAMETER(i_dst);
-    idct_8x32_c(src, dst, 8 | 0x01);
+    idct_8x32_c(h, src, dst, 8 | 0x01);
     inv_wavelet_16x64_c(dst);
 }
 
@@ -1658,7 +1658,7 @@ void xavs2_dct_init(uint32_t cpuid, dct_funcs_t *dctf)
 
         /* dct: asymmetrical */
         dctf->dct[LUMA_16x4 ] = dct_c_16x4_sse128;
-        dctf->dct[LUMA_4x16 ] = dct_c_4x16_sse128;//µÚÒ»´Î±ä»»Ã»Ð´ÒÆÎ»
+        dctf->dct[LUMA_4x16 ] = dct_c_4x16_sse128;// shift not implemented in the first transform stage
         dctf->dct[LUMA_32x8 ] = dct_c_32x8_sse128;
         dctf->dct[LUMA_8x32 ] = dct_c_8x32_sse128;
         dctf->dct[LUMA_64x16] = dct_c_64x16_sse128;
@@ -1708,8 +1708,8 @@ void xavs2_dct_init(uint32_t cpuid, dct_funcs_t *dctf)
         dctf->dct[LUMA_8x8   ] = xavs2_dct_8x8_sse4;
     }
 
+#if defined(__AVX2__)
     if (cpuid & XAVS2_CPU_AVX2) {
-
         dctf->dct [LUMA_4x4   ] = xavs2_dct_4x4_avx2;
 #if ARCH_X86_64
         dctf->dct [LUMA_8x8   ] = xavs2_dct_8x8_avx2;
@@ -1723,13 +1723,12 @@ void xavs2_dct_init(uint32_t cpuid, dct_funcs_t *dctf)
 #endif
     }
 
-
 #if ARCH_X86_64
     if (cpuid & XAVS2_CPU_AVX2) {
-        // dctf->dct[LUMA_4x4 ] = dct_c_4x4_avx2;   /* futl: dct_4x4_avx2µÄËÙ¶È±Èdct_4x4_sse128ÂÔÂýÒ»µã */
-        // dctf->dct[LUMA_8x8 ] = dct_c_8x8_avx2;   /* futl: dct_8x8_avx2µÄËÙ¶È±Èxavs2_dct_8x8_avx2Âý */
-        // dctf->dct[LUMA_4x16] = dct_c_4x16_avx2; /* futl: dct_4x16_avx2µÄËÙ¶È±Èdct_4x16_sse128Âý */
-        dctf->dct[LUMA_16x4 ] = dct_c_16x4_avx2;   /* ½ª²¨£ºËÙ¶È±Èsse128¿ìÁ½±¶ */
+        // dctf->dct[LUMA_4x4 ] = dct_c_4x4_avx2;   /* futl: dct_4x4_avx2 is slightly slower than dct_4x4_sse128 */
+        // dctf->dct[LUMA_8x8 ] = dct_c_8x8_avx2;   /* futl: dct_8x8_avx2 is slower than xavs2_dct_8x8_avx2 */
+        // dctf->dct[LUMA_4x16] = dct_c_4x16_avx2; /* futl: dct_4x16_avx2 is slower than dct_4x16_sse128 */
+        dctf->dct[LUMA_16x4 ] = dct_c_16x4_avx2;   /* Jiang Bo: twice as fast as sse128 */
         dctf->dct[LUMA_8x32 ] = dct_c_8x32_avx2;
         dctf->dct[LUMA_32x8 ] = dct_c_32x8_avx2;
         dctf->dct[LUMA_16x16] = dct_c_16x16_avx2;
@@ -1751,6 +1750,7 @@ void xavs2_dct_init(uint32_t cpuid, dct_funcs_t *dctf)
         dctf->dct_half[LUMA_64x64] = dct_c_64x64_half_avx2;
     }
 #endif  // ARCH_X86_64
+#endif
 #else
     UNUSED_PARAMETER(cpuid);
 #endif  // if HAVE_MMX
diff --git a/source/common/vec/intrinsic.h b/source/common/vec/intrinsic.h
index 27a68ac..ec0a1bd 100644
--- a/source/common/vec/intrinsic.h
+++ b/source/common/vec/intrinsic.h
@@ -47,7 +47,7 @@
 #define M128_I16(mx, idx)  _mm_extract_epi16(mx, idx)
 
 
-#if _MSC_VER // ½â¾övsÏÂimmintrin.hÖÐÃ»ÓÐ¶¨ÒåÕâÐ©º¯ÊýµÄÎÊÌâ
+#if _MSC_VER // work around immintrin.h under Visual Studio not defining these functions
 #define _mm256_extract_epi64(a, i) (a.m256i_i64[i])
 #define _mm256_extract_epi32(a, i) (a.m256i_i32[i])
 #define _mm256_extract_epi16(a, i) (a.m256i_i16[i])
@@ -61,7 +61,7 @@
 #define _mm256_insert_epi16(a, value, index) (a.m256i_i16[index] = value)
 #define _mm256_insert_epi8 (a, value, index) (a.m256i_i8 [index] = value)
 #else
-// Ìí¼Ó²¿·ÖgccÖÐÈ±ÉÙµÄavxº¯Êý¶¨Òå
+// add some AVX function definitions that are missing in gcc
 #define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
             _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
 #define _mm256_loadu2_m128i(/* __m128i const* */ hiaddr, \
@@ -98,40 +98,43 @@ ALIGN16(extern const int8_t tab_coeff_mode_11[64][16]);
 #define intpl_copy_block_sse128 FPFX(intpl_copy_block_sse128)
 void intpl_copy_block_sse128      (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height);
 #define intpl_luma_block_hor_sse128 FPFX(intpl_luma_block_hor_sse128)
-void intpl_luma_block_hor_sse128  (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
+void intpl_luma_block_hor_sse128  (xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
 #define intpl_luma_block_ver_sse128 FPFX(intpl_luma_block_ver_sse128)
-void intpl_luma_block_ver_sse128  (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
+void intpl_luma_block_ver_sse128  (xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
 #define intpl_luma_block_ext_sse128 FPFX(intpl_luma_block_ext_sse128)
-void intpl_luma_block_ext_sse128  (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v);
+void intpl_luma_block_ext_sse128  (xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v);
 
 #define intpl_luma_hor_sse128 FPFX(intpl_luma_hor_sse128)
-void intpl_luma_hor_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
+void intpl_luma_hor_sse128(xavs2_t *h, pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
 #define intpl_luma_ver_sse128 FPFX(intpl_luma_ver_sse128)
-void intpl_luma_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
+void intpl_luma_ver_sse128(xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
 #define intpl_luma_ext_sse128 FPFX(intpl_luma_ext_sse128)
-void intpl_luma_ext_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff);
+void intpl_luma_ext_sse128(xavs2_t *h, pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff);
 
+#if defined(__AVX2__)
 #define intpl_luma_hor_avx2 FPFX(intpl_luma_hor_avx2)
 void intpl_luma_hor_avx2(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff);
 #define intpl_luma_ver_avx2 FPFX(intpl_luma_ver_avx2)
 void intpl_luma_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff);
 #define intpl_luma_ext_avx2 FPFX(intpl_luma_ext_avx2)
 void intpl_luma_ext_avx2(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff);
+#endif
 
 #define intpl_luma_hor_x3_sse128 FPFX(intpl_luma_hor_x3_sse128)
-void intpl_luma_hor_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff);
+void intpl_luma_hor_x3_sse128(xavs2_t *h, pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff);
 #define intpl_luma_ver_x3_sse128 FPFX(intpl_luma_ver_x3_sse128)
-void intpl_luma_ver_x3_sse128(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, const int8_t **coeff);
+void intpl_luma_ver_x3_sse128(xavs2_t *h, pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, const int8_t **coeff);
 #define intpl_luma_ext_x3_sse128 FPFX(intpl_luma_ext_x3_sse128)
-void intpl_luma_ext_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff);
+void intpl_luma_ext_x3_sse128(xavs2_t *h, pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff);
 
 #define intpl_chroma_block_hor_sse128 FPFX(intpl_chroma_block_hor_sse128)
-void intpl_chroma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
+void intpl_chroma_block_hor_sse128(xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
 #define intpl_chroma_block_ver_sse128 FPFX(intpl_chroma_block_ver_sse128)
-void intpl_chroma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
+void intpl_chroma_block_ver_sse128(xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
 #define intpl_chroma_block_ext_sse128 FPFX(intpl_chroma_block_ext_sse128)
-void intpl_chroma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v);
+void intpl_chroma_block_ext_sse128(xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v);
 
+#if defined(__AVX2__)
 #define intpl_luma_block_hor_avx2 FPFX(intpl_luma_block_hor_avx2)
 void intpl_luma_block_hor_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff);
 #define intpl_luma_block_ver_avx2 FPFX(intpl_luma_block_ver_avx2)
@@ -152,6 +155,7 @@ void intpl_luma_hor_x3_avx2(pel_t *const dst[3], int i_dst, mct_t *const tmp[3],
 void intpl_luma_ver_x3_avx2(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, const int8_t **coeff);
 #define intpl_luma_ext_x3_avx2 FPFX(intpl_luma_ext_x3_avx2)
 void intpl_luma_ext_x3_avx2(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff);
+#endif
 
 /* memory operation */
 #define cpy_pel_I420_to_uchar_YUY2_sse128 FPFX(cpy_pel_I420_to_uchar_YUY2_sse128)
@@ -160,12 +164,13 @@ void cpy_pel_I420_to_uchar_YUY2_sse128(const pel_t *srcy, const pel_t *srcu, con
 void add_pel_clip_sse128(const pel_t *src1, int i_src1, const int16_t *src2, int i_src2, pel_t *dst, int i_dst, int width, int height, int bit_depth);
 #define xavs2_pixel_average_sse128 FPFX(pixel_average_sse128)
 void xavs2_pixel_average_sse128(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height);
-#define xavs2_pixel_average_avx FPFX(pixel_average_avx)
-void xavs2_pixel_average_avx   (pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height);
 #define padding_rows_sse128 FPFX(padding_rows_sse128)
 void padding_rows_sse128   (pel_t *src, int i_src, int width, int height, int start, int rows, int pad);
 #define padding_rows_lr_sse128 FPFX(padding_rows_lr_sse128)
 void padding_rows_lr_sse128(pel_t *src, int i_src, int width, int height, int start, int rows, int pad);
+#if defined(__AVX2__)
+#define xavs2_pixel_average_avx FPFX(pixel_average_avx)
+void xavs2_pixel_average_avx   (pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height);
 #define padding_rows_sse256 FPFX(padding_rows_sse256)
 void padding_rows_sse256(pel_t *src, int i_src, int width, int height, int start, int rows, int pad);
 #define padding_rows_sse256_10bit FPFX(padding_rows_sse256_10bit)
@@ -174,15 +179,15 @@ void padding_rows_sse256_10bit(pel_t *src, int i_src, int width, int height, int
 void padding_rows_lr_sse256(pel_t *src, int i_src, int width, int height, int start, int rows, int pad);
 #define padding_rows_lr_sse256_10bit FPFX(padding_rows_lr_sse256)
 void padding_rows_lr_sse256_10bit(pel_t *src, int i_src, int width, int height, int start, int rows, int pad);
-
-#define xavs2_memzero_aligned_c_sse2 FPFX(memzero_aligned_c_sse2)
-void *xavs2_memzero_aligned_c_sse2(void *dst, size_t n);
 #define xavs2_memzero_aligned_c_avx FPFX(memzero_aligned_c_avx)
 void *xavs2_memzero_aligned_c_avx (void *dst, size_t n);
-#define xavs2_mem_repeat_i_c_sse2 FPFX(mem_repeat_i_c_sse2)
-void  xavs2_mem_repeat_i_c_sse2   (void *dst, int val, size_t count);
 #define xavs2_mem_repeat_i_c_avx FPFX(mem_repeat_i_c_avx)
 void  xavs2_mem_repeat_i_c_avx    (void *dst, int val, size_t count);
+#endif
+#define xavs2_memzero_aligned_c_sse2 FPFX(memzero_aligned_c_sse2)
+void *xavs2_memzero_aligned_c_sse2(void *dst, size_t n);
+#define xavs2_mem_repeat_i_c_sse2 FPFX(mem_repeat_i_c_sse2)
+void  xavs2_mem_repeat_i_c_sse2   (void *dst, int val, size_t count);
 #define xavs2_memcpy_aligned_c_sse2 FPFX(memcpy_aligned_c_sse2)
 void *xavs2_memcpy_aligned_c_sse2 (void *dst, const void *src, size_t n);
 
@@ -196,6 +201,7 @@ void deblock_edge_ver_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int A
 void deblock_edge_hor_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, unsigned char *flt_flag);
 
 //--------avx2--------    add by zhangjiaqi    2016-12-02
+#if defined(__AVX2__)
 #define deblock_edge_hor_avx2 FPFX(deblock_edge_hor_avx2)
 void deblock_edge_hor_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag);
 #define deblock_edge_ver_avx2 FPFX(deblock_edge_ver_avx2)
@@ -204,32 +210,34 @@ void deblock_edge_ver_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8
 void deblock_edge_hor_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag);
 #define deblock_edge_ver_c_avx2 FPFX(deblock_edge_ver_c_avx2)
 void deblock_edge_ver_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag);
+#endif
 
 #define dct_c_4x4_sse128 FPFX(dct_c_4x4_sse128)
-void dct_c_4x4_sse128  (const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_4x4_sse128  (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_8x8_sse128 FPFX(dct_c_8x8_sse128)
-void dct_c_8x8_sse128  (const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_8x8_sse128  (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_16x16_sse128 FPFX(dct_c_16x16_sse128)
-void dct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_16x16_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_32x32_sse128 FPFX(dct_c_32x32_sse128)
-void dct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_32x32_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_64x64_sse128 FPFX(dct_c_64x64_sse128)
-void dct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_64x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 
 #define dct_c_4x16_sse128 FPFX(dct_c_4x16_sse128)
-void dct_c_4x16_sse128 (const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_4x16_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_8x32_sse128 FPFX(dct_c_8x32_sse128)
-void dct_c_8x32_sse128 (const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_8x32_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_16x4_sse128 FPFX(dct_c_16x4_sse128)
-void dct_c_16x4_sse128 (const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_16x4_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_32x8_sse128 FPFX(dct_c_32x8_sse128)
-void dct_c_32x8_sse128 (const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_32x8_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_64x16_sse128 FPFX(dct_c_64x16_sse128)
-void dct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_64x16_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_16x64_sse128 FPFX(dct_c_16x64_sse128)
-void dct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_16x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 
 //futl
+#if defined(__AVX2__)
 #define dct_c_4x4_avx2 FPFX(dct_c_4x4_avx2)
 void dct_c_4x4_avx2(const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_8x8_avx2 FPFX(dct_c_8x8_avx2)
@@ -255,54 +263,57 @@ void dct_c_64x64_avx2(const coeff_t *src, coeff_t *dst, int i_src);
 void dct_c_64x16_avx2(const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_16x64_avx2 FPFX(dct_c_16x64_avx2)
 void dct_c_16x64_avx2(const coeff_t *src, coeff_t *dst, int i_src);
+#endif
 
 /* half DCT, only keep low frequency coefficients */
 #define dct_c_32x32_half_sse128 FPFX(dct_c_32x32_half_sse128)
-void dct_c_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_32x32_half_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_64x64_half_sse128 FPFX(dct_c_64x64_half_sse128)
-void dct_c_64x64_half_sse128(const coeff_t *src, coeff_t *dst, int i_src);
+void dct_c_64x64_half_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
+#if defined(__AVX2__)
 #define dct_c_32x32_half_avx2 FPFX(dct_c_32x32_half_avx2)
 void dct_c_32x32_half_avx2(const coeff_t *src, coeff_t *dst, int i_src);
 #define dct_c_64x64_half_avx2 FPFX(dct_c_64x64_half_avx2)
 void dct_c_64x64_half_avx2(const coeff_t *src, coeff_t *dst, int i_src);
+#endif
 
 #define transform_4x4_2nd_sse128 FPFX(transform_4x4_2nd_sse128)
-void transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff);
+void transform_4x4_2nd_sse128(xavs2_t *h, coeff_t *coeff, int i_coeff);
 #define transform_2nd_sse128 FPFX(transform_2nd_sse128)
 void transform_2nd_sse128    (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left);
 
 #define idct_c_4x4_sse128 FPFX(idct_c_4x4_sse128)
-void idct_c_4x4_sse128  (const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_4x4_sse128  (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 #define idct_c_8x8_sse128 FPFX(idct_c_8x8_sse128)
-void idct_c_8x8_sse128  (const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_8x8_sse128  (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 #define idct_c_16x16_sse128 FPFX(idct_c_16x16_sse128)
-void idct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_16x16_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 #define idct_c_32x32_sse128 FPFX(idct_c_32x32_sse128)
-void idct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_32x32_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 #define idct_c_64x64_sse128 FPFX(idct_c_64x64_sse128)
-void idct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_64x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 
 #define idct_c_16x4_sse128 FPFX(idct_c_16x4_sse128)
-void idct_c_16x4_sse128 (const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_16x4_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 #define idct_c_32x8_sse128 FPFX(idct_c_32x8_sse128)
-void idct_c_32x8_sse128 (const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_32x8_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 #define idct_c_64x16_sse128 FPFX(idct_c_64x16_sse128)
-void idct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_64x16_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 
 #define idct_c_4x16_sse128 FPFX(idct_c_4x16_sse128)
-void idct_c_4x16_sse128 (const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_4x16_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 #define idct_c_8x32_sse128 FPFX(idct_c_8x32_sse128)
-void idct_c_8x32_sse128 (const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_8x32_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 #define idct_c_16x64_sse128 FPFX(idct_c_16x64_sse128)
-void idct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst);
+void idct_c_16x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 
 #define inv_transform_4x4_2nd_sse128 FPFX(inv_transform_4x4_2nd_sse128)
-void inv_transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff);
+void inv_transform_4x4_2nd_sse128(xavs2_t *h, coeff_t *coeff, int i_coeff);
 #define inv_transform_2nd_sse128 FPFX(inv_transform_2nd_sse128)
 void inv_transform_2nd_sse128    (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left);
 
-
 //zhangjiaqi add 2016.11.30    avx2
+#if defined(__AVX2__)
 #define idct_c_8x8_avx2 FPFX(idct_c_8x8_avx2)
 void idct_c_8x8_avx2  (const coeff_t *src, coeff_t *dst, int i_dst);
 #define idct_c_16x16_avx2 FPFX(idct_c_16x16_avx2)
@@ -315,6 +326,7 @@ void idct_c_64x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst);
 void idct_c_64x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst);
 #define idct_c_16x64_avx2 FPFX(idct_c_16x64_avx2)
 void idct_c_16x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst);
+#endif
 
 // scan the cg coefficient
 #define coeff_scan_4x4_xy_sse128 FPFX(coeff_scan_4x4_xy_sse128)
@@ -326,111 +338,153 @@ void coeff_scan_4x4_yx_sse128(coeff_t *dst, const coeff_t *src, int i_src_shift)
 void coeff_scan4_xy_sse128(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4);
 #define coeff_scan4_yx_sse128 FPFX(coeff_scan4_yx_sse128)
 void coeff_scan4_yx_sse128(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4);
+#if defined(__AVX2__)
 #define coeff_scan4_xy_avx FPFX(coeff_scan4_xy_avx)
 void coeff_scan4_xy_avx(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4);
 #define coeff_scan4_yx_avx FPFX(coeff_scan4_yx_avx)
 void coeff_scan4_yx_avx(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4);
-
+#endif
 #define abs_coeff_sse128 FPFX(abs_coeff_sse128)
 void abs_coeff_sse128(coeff_t *dst, const coeff_t *src, const int i_coef);
 #define add_sign_sse128 FPFX(add_sign_sse128)
 int add_sign_sse128(coeff_t *dst, const coeff_t *abs_val, const int i_coef);
 
-#define quant_c_avx2 FPFX(quant_c_avx2)
-int quant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add);
-#define dequant_c_avx2 FPFX(dequant_c_avx2)
-void dequant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift);
 #define quant_c_sse128 FPFX(quant_c_avx2)
 int quant_c_sse128(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add);
 #define dequant_c_sse128 FPFX(dequant_c_sse128)
 void dequant_c_sse128(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add);
+#if defined(__AVX2__)
+#define quant_c_avx2 FPFX(quant_c_avx2)
+int quant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add);
+#define dequant_c_avx2 FPFX(dequant_c_avx2)
+void dequant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift);
 #define abs_coeff_avx2 FPFX(abs_coeff_avx2)
 void abs_coeff_avx2(coeff_t *dst, const coeff_t *src, const int i_coef);
 #define add_sign_avx2 FPFX(add_sign_avx2)
 int add_sign_avx2(coeff_t *dst, const coeff_t *abs_val, const int i_coef);
-
-#define SAO_on_block_sse128 FPFX(SAO_on_block_sse128)
-void SAO_on_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src,
-                         int i_src, int i_block_w, int i_block_h,
-                         int *lcu_avail, SAOBlkParam *sao_param);
 #define SAO_on_block_sse256 FPFX(SAO_on_block_sse256)
 void SAO_on_block_sse256(pel_t *p_dst, int i_dst, pel_t *p_src,
                          int i_src,int i_block_w, int i_block_h,
                          int *lcu_avail, SAOBlkParam *sao_param);
+#endif
+
+#if !HIGH_BIT_DEPTH /* non-high-bit-depth build: a single SAO entry point taking the whole SAOBlkParam */
+#define SAO_on_block_sse128 FPFX(SAO_on_block_sse128)
+void SAO_on_block_sse128(xavs2_t *h, pel_t *p_dst, int i_dst, pel_t *p_src,
+                         int i_src, int i_block_w, int i_block_h,
+                         int *lcu_avail, SAOBlkParam *sao_param);
+#else /* HIGH_BIT_DEPTH: one prototype per SAO variant (bo, eo_0, eo_45, eo_90, eo_135) */
+#define SAO_on_block_bo_sse128 FPFX(SAO_on_block_bo_sse128)
+void SAO_on_block_bo_sse128(xavs2_t *h, pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const SAOBlkParam* saoBlkParam);
+#define SAO_on_block_eo_0_sse128 FPFX(SAO_on_block_eo_0_sse128)
+void SAO_on_block_eo_0_sse128(xavs2_t *h, pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset);
+#define SAO_on_block_eo_45_sse128 FPFX(SAO_on_block_eo_45_sse128)
+void SAO_on_block_eo_45_sse128(xavs2_t *h, pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset);
+#define SAO_on_block_eo_90_sse128 FPFX(SAO_on_block_eo_90_sse128)
+void SAO_on_block_eo_90_sse128(xavs2_t *h, pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset);
+#define SAO_on_block_eo_135_sse128 FPFX(SAO_on_block_eo_135_sse128)
+void SAO_on_block_eo_135_sse128(xavs2_t *h, pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset);
+#endif /* !HIGH_BIT_DEPTH */
 
 #define alf_flt_one_block_sse128 FPFX(alf_flt_one_block_sse128)
-void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
+#if !HIGH_BIT_DEPTH /* this variant takes separate dst and src strides (i_dst, i_src) */
+void alf_flt_one_block_sse128(xavs2_t *h, pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
                               int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
                               int *alf_coeff, int b_top_avail, int b_down_avail);
+#else /* HIGH_BIT_DEPTH: dst and src share one stride and src is const, so the two signatures are NOT interchangeable */
+void alf_flt_one_block_sse128(xavs2_t *h, pel_t* p_dst, const pel_t* p_src, int stride,
+    int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
+    int* alf_coeff, int b_top_avail, int b_down_avail);
+#endif /* !HIGH_BIT_DEPTH */
+#if defined(__AVX2__) /* AVX2 ALF block filter; parameter list matches the single-stride SSE prototype */
+#define alf_filter_block_avx2 FPFX(alf_filter_block_avx2) /* macro name must equal the declared function, otherwise the symbol is exported unprefixed */
+void alf_filter_block_avx2(xavs2_t *h, pel_t* p_dst, const pel_t* p_src, int stride,
+    int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
+    int* alf_coeff, int b_top_avail, int b_down_avail);
+#endif /* __AVX2__ */
 
+#if !HIGH_BIT_DEPTH /* basic intra predictors (dc, plane, bilinear, hor, ver) */
 #define intra_pred_dc_sse128 FPFX(intra_pred_dc_sse128)
-void intra_pred_dc_sse128       (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_dc_sse128       (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_plane_sse128 FPFX(intra_pred_plane_sse128)
-void intra_pred_plane_sse128    (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_plane_sse128    (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_bilinear_sse128 FPFX(intra_pred_bilinear_sse128)
-void intra_pred_bilinear_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_bilinear_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_hor_sse128 FPFX(intra_pred_hor_sse128)
-void intra_pred_hor_sse128      (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_hor_sse128      (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ver_sse128 FPFX(intra_pred_ver_sse128)
-void intra_pred_ver_sse128      (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ver_sse128      (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+#else /* HIGH_BIT_DEPTH: same names and parameter types as the branch above; only parameter names differ */
+#define intra_pred_dc_sse128 FPFX(intra_pred_dc_sse128)
+void intra_pred_dc_sse128       (xavs2_t *h, pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight);
+#define intra_pred_plane_sse128 FPFX(intra_pred_plane_sse128)
+void intra_pred_plane_sse128    (xavs2_t *h, pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight);
+#define intra_pred_bilinear_sse128 FPFX(intra_pred_bilinear_sse128)
+void intra_pred_bilinear_sse128 (xavs2_t *h, pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight);
+#define intra_pred_hor_sse128 FPFX(intra_pred_hor_sse128)
+void intra_pred_hor_sse128      (xavs2_t *h, pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight);
+#define intra_pred_ver_sse128 FPFX(intra_pred_ver_sse128)
+void intra_pred_ver_sse128      (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+#endif /* !HIGH_BIT_DEPTH */
 
 #define intra_pred_ang_x_3_sse128 FPFX(intra_pred_ang_x_3_sse128)
-void intra_pred_ang_x_3_sse128  (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_x_3_sse128  (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_x_4_sse128 FPFX(intra_pred_ang_x_4_sse128)
-void intra_pred_ang_x_4_sse128  (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_x_4_sse128  (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_x_5_sse128 FPFX(intra_pred_ang_x_5_sse128)
-void intra_pred_ang_x_5_sse128  (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_x_5_sse128  (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_x_6_sse128 FPFX(intra_pred_ang_x_6_sse128)
-void intra_pred_ang_x_6_sse128  (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_x_6_sse128  (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_x_7_sse128 FPFX(intra_pred_ang_x_7_sse128)
-void intra_pred_ang_x_7_sse128  (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_x_7_sse128  (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_x_8_sse128 FPFX(intra_pred_ang_x_8_sse128)
-void intra_pred_ang_x_8_sse128  (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_x_8_sse128  (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_x_9_sse128 FPFX(intra_pred_ang_x_9_sse128)
-void intra_pred_ang_x_9_sse128  (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_x_9_sse128  (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_x_10_sse128 FPFX(intra_pred_ang_x_10_sse128)
-void intra_pred_ang_x_10_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_x_10_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_x_11_sse128 FPFX(intra_pred_ang_x_11_sse128)
-void intra_pred_ang_x_11_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_x_11_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 
 #define intra_pred_ang_y_25_sse128 FPFX(intra_pred_ang_y_25_sse128)
-void intra_pred_ang_y_25_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_y_25_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_y_26_sse128 FPFX(intra_pred_ang_y_26_sse128)
-void intra_pred_ang_y_26_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_y_26_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_y_28_sse128 FPFX(intra_pred_ang_y_28_sse128)
-void intra_pred_ang_y_28_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_y_28_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_y_30_sse128 FPFX(intra_pred_ang_y_30_sse128)
-void intra_pred_ang_y_30_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_y_30_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_y_31_sse128 FPFX(intra_pred_ang_y_31_sse128)
-void intra_pred_ang_y_31_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_y_31_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_y_32_sse128 FPFX(intra_pred_ang_y_32_sse128)
-void intra_pred_ang_y_32_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_y_32_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 
 #define intra_pred_ang_xy_13_sse128 FPFX(intra_pred_ang_xy_13_sse128)
-void intra_pred_ang_xy_13_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_xy_13_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_xy_14_sse128 FPFX(intra_pred_ang_xy_14_sse128)
-void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_xy_14_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_xy_16_sse128 FPFX(intra_pred_ang_xy_16_sse128)
-void intra_pred_ang_xy_16_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_xy_16_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_xy_18_sse128 FPFX(intra_pred_ang_xy_18_sse128)
-void intra_pred_ang_xy_18_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_xy_18_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_xy_20_sse128 FPFX(intra_pred_ang_xy_20_sse128)
-void intra_pred_ang_xy_20_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_xy_20_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_xy_22_sse128 FPFX(intra_pred_ang_xy_22_sse128)
-void intra_pred_ang_xy_22_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_xy_22_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_xy_23_sse128 FPFX(intra_pred_ang_xy_23_sse128)
-void intra_pred_ang_xy_23_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
+void intra_pred_ang_xy_23_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 
 #define fill_edge_samples_0_sse128 FPFX(fill_edge_samples_0_sse128)
-void fill_edge_samples_0_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy);
+void fill_edge_samples_0_sse128 (xavs2_t *h, const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy);
 #define fill_edge_samples_x_sse128 FPFX(fill_edge_samples_x_sse128)
-void fill_edge_samples_x_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy);
+void fill_edge_samples_x_sse128 (xavs2_t *h, const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy);
 #define fill_edge_samples_y_sse128 FPFX(fill_edge_samples_y_sse128)
-void fill_edge_samples_y_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy);
+void fill_edge_samples_y_sse128 (xavs2_t *h, const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy);
 #define fill_edge_samples_xy_sse128 FPFX(fill_edge_samples_xy_sse128)
-void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy);
+void fill_edge_samples_xy_sse128(xavs2_t *h, const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy);
 
 //intra prediction avx functions
+#if defined(__AVX2__)
 #define intra_pred_ver_avx FPFX(intra_pred_ver_avx)
 void intra_pred_ver_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_hor_avx FPFX(intra_pred_hor_avx)
@@ -487,7 +541,7 @@ void intra_pred_ang_y_30_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, in
 void intra_pred_ang_y_31_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
 #define intra_pred_ang_y_32_avx FPFX(intra_pred_ang_y_32_avx)
 void intra_pred_ang_y_32_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy);
-
+#endif
 
 
 #define mad_16x16_sse128 FPFX(mad_16x16_sse128)
diff --git a/source/common/vec/intrinsic_alf.c b/source/common/vec/intrinsic_alf.c
index 00bc61c..88a4100 100644
--- a/source/common/vec/intrinsic_alf.c
+++ b/source/common/vec/intrinsic_alf.c
@@ -44,7 +44,9 @@
 #include <smmintrin.h>
 
 
-void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
+#if !HIGH_BIT_DEPTH
+void alf_flt_one_block_sse128(xavs2_t *h,
+                              pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
                               int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
                               int *alf_coeff, int b_top_avail, int b_down_avail)
 {
@@ -58,7 +60,7 @@ void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
     __m128i mSwitch1, mSwitch2, mSwitch3, mSwitch4, mSwitch5;
     __m128i mAddOffset;
     __m128i mZero = _mm_set1_epi16(0);
-    __m128i mMax = _mm_set1_epi16((short)(max_pel_value));
+    __m128i mMax = _mm_set1_epi16((short)((1 << h->param->input_sample_bit_depth) - 1));
     __m128i mask;
 
     int i, j;
@@ -113,15 +115,15 @@ void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
             T01 = _mm_loadu_si128((__m128i*)&p_src5[j]);
             E00 = _mm_unpacklo_epi8(T00, T01);
             E01 = _mm_unpackhi_epi8(T00, T01);
-            S00 = _mm_maddubs_epi16(E00, C0);//Ç°8¸öÏñËØËùÓÐC0*P0µÄ½á¹û
-            S01 = _mm_maddubs_epi16(E01, C0);//ºó8¸öÏñËØËùÓÐC0*P0µÄ½á¹û
+            S00 = _mm_maddubs_epi16(E00, C0);// C0*P0 results for the first 8 pixels
+            S01 = _mm_maddubs_epi16(E01, C0);// C0*P0 results for the last 8 pixels
 
             T10 = _mm_loadu_si128((__m128i*)&p_src4[j]);
             T11 = _mm_loadu_si128((__m128i*)&p_src3[j]);
             E10 = _mm_unpacklo_epi8(T10, T11);
             E11 = _mm_unpackhi_epi8(T10, T11);
-            S10 = _mm_maddubs_epi16(E10, C1);//Ç°8¸öÏñËØËùÓÐC1*P1µÄ½á¹û
-            S11 = _mm_maddubs_epi16(E11, C1);//ºó8¸öÏñËØËùÓÐC1*P1µÄ½á¹û
+            S10 = _mm_maddubs_epi16(E10, C1);// C1*P1 results for the first 8 pixels
+            S11 = _mm_maddubs_epi16(E11, C1);// C1*P1 results for the last 8 pixels
 
             T20 = _mm_loadu_si128((__m128i*)&p_src2[j - 1]);
             T21 = _mm_loadu_si128((__m128i*)&p_src1[j + 1]);
@@ -161,26 +163,26 @@ void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
             S8 = _mm_maddubs_epi16(T4, C33);
             S50 = _mm_hadds_epi16(S5, S6);
             S51 = _mm_hadds_epi16(S7, S8);
-            S5 = _mm_hadds_epi16(S50, S51);//Ç°8¸ö
+            S5 = _mm_hadds_epi16(S50, S51);// first 8 pixels
             S4 = _mm_maddubs_epi16(T5, C33);
             S6 = _mm_maddubs_epi16(T6, C33);
             S7 = _mm_maddubs_epi16(T7, C33);
             S8 = _mm_maddubs_epi16(T8, C33);
             S60 = _mm_hadds_epi16(S4, S6);
             S61 = _mm_hadds_epi16(S7, S8);
-            S6 = _mm_hadds_epi16(S60, S61);//ºó8¸ö
+            S6 = _mm_hadds_epi16(S60, S61);// last 8 pixels
 
             S0 = _mm_adds_epi16(S00, S10);
             S1 = _mm_adds_epi16(S30, S20);
             S2 = _mm_adds_epi16(S40, S5);
             S3 = _mm_adds_epi16(S1, S0);
-            SS1 = _mm_adds_epi16(S2, S3);//Ç°8¸ö
+            SS1 = _mm_adds_epi16(S2, S3);// first 8 pixels
 
             S0 = _mm_adds_epi16(S01, S11);
             S1 = _mm_adds_epi16(S31, S21);
             S2 = _mm_adds_epi16(S41, S6);
             S3 = _mm_adds_epi16(S1, S0);
-            SS2 = _mm_adds_epi16(S2, S3);//ºó8¸ö
+            SS2 = _mm_adds_epi16(S2, S3);// last 8 pixels
 
 
             SS1 = _mm_adds_epi16(SS1, mAddOffset);
@@ -206,5 +208,309 @@ void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
         p_dst += i_dst;
     }
 }
+#else
+/*****************************************************************************
+*  Copyright (C) 2016 uavs2dec project,
+*  National Engineering Laboratory for Video Technology(Shenzhen),
+*  Digital Media R&D Center at Peking University Shenzhen Graduate School, China
+*  Project Leader: Ronggang Wang <rgwang@pkusz.edu.cn>
+*
+*  Main Authors: Zhenyu Wang <wangzhenyu@pkusz.edu.cn>, Kui Fan <kuifan@pku.edu.cn>
+*               Shenghao Zhang <1219759986@qq.com>, Bingjie Han, Kaili Yao, Hongbin Cao,  Yueming Wang,
+*               Jing Su, Jiaying Yan, Junru Li
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at rgwang@pkusz.edu.cn.
+*****************************************************************************/
+/* High-bit-depth ALF: filters one block of 16-bit samples with nine coefficients (alf_coeff[0..8]), eight pixels per iteration, using SSE4.1 intrinsics. */
+void alf_flt_one_block_sse128(xavs2_t *h,
+                              pel_t* p_dst, const pel_t* p_src, int stride, /* dst and src are stepped with the same stride (in pel_t units) */
+                              int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, /* block origin and size, in pixels */
+                              int* alf_coeff, int b_top_avail, int b_down_avail) /* the *_avail flags shift the filtered row window, see startPos/endPos */
+{
+    const pel_t* p_src1, * p_src2, * p_src3, * p_src4, * p_src5, * p_src6;
+    /* T*: loaded pixel rows, E*: interleaved pixel pairs, S*: 32-bit tap products, C*: broadcast coefficients */
+    __m128i T00, T01, T10, T11, T20, T21, T30, T31, T40, T41;
+    __m128i E00, E01, E10, E11, E20, E21, E30, E31, E40, E41;
+    __m128i C0, C1, C2, C3, C4, C5, C6, C7, C8;
+    __m128i S00, S01, S10, S11, S20, S21, S30, S31, S40, S41, S50, S51, S60, S61, SS1, SS2, S, S70, S71, S80, S81;
+    __m128i mAddOffset;
+    __m128i mask;
+    __m128i zero = _mm_setzero_si128();
+    int max_pixel = (1 << h->param->input_sample_bit_depth) - 1; /* upper clamp, derived from the run-time bit depth */
+    __m128i max_val = _mm_set1_epi16(max_pixel);
+
+    int i, j;
+    int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y; /* first filtered row; moved up by 4 when the top flag is set */
+    int endPos = b_down_avail ? (lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); /* one past the last filtered row; 4 rows are left out when the down flag is set */
+    int lcu_pix_xEnd = lcu_pix_x + lcu_width;
+    /* move both pointers to the first filtered row */
+    p_src += (startPos * stride);
+    p_dst += (startPos * stride);
+    /* NOTE(review): coefficients are narrowed through pel_t before the signed 16-bit broadcast - confirm negative values survive on all targets */
+    C0 = _mm_set1_epi16((pel_t)alf_coeff[0]);
+    C1 = _mm_set1_epi16((pel_t)alf_coeff[1]);
+    C2 = _mm_set1_epi16((pel_t)alf_coeff[2]);
+    C3 = _mm_set1_epi16((pel_t)alf_coeff[3]);
+    C4 = _mm_set1_epi16((pel_t)alf_coeff[4]);
+    C5 = _mm_set1_epi16((pel_t)alf_coeff[5]);
+    C6 = _mm_set1_epi16((pel_t)alf_coeff[6]);
+    C7 = _mm_set1_epi16((pel_t)alf_coeff[7]);
+    C8 = _mm_set1_epi16((pel_t)alf_coeff[8]);
+    /* rounding term for the final >> 6 */
+    mAddOffset = _mm_set1_epi32(32);
+    /* width not a multiple of 8: the last group of every row is written with a masked store */
+    if (lcu_width & 7) {
+        int lcu_pix_xEnd8 = lcu_pix_xEnd - (lcu_width & 0x07); /* x position of the final, partial group */
+        mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(lcu_width & 7) - 1])); /* store mask from the intrinsic_mask table (defined outside this function) */
+        for (i = startPos; i < endPos; i++) {
+            int yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 1); /* rows i-1 / i+1, clamped to [startPos, endPos - 1] */
+            int yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 1);
+            p_src1 = p_src + (yBottom - i) * stride;
+            p_src2 = p_src + (yUp - i) * stride;
+            /* rows i-2 / i+2, clamped the same way */
+            yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 2);
+            yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 2);
+            p_src3 = p_src + (yBottom - i) * stride;
+            p_src4 = p_src + (yUp - i) * stride;
+            /* rows i-3 / i+3, clamped the same way */
+            yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 3);
+            yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 3);
+            p_src5 = p_src + (yBottom - i) * stride;
+            p_src6 = p_src + (yUp - i) * stride;
+            /* each tap interleaves two source vectors so one madd yields coeff * (a + b) as 32-bit sums */
+            for (j = lcu_pix_x; j < lcu_pix_xEnd; j += 8) {
+                T00 = _mm_loadu_si128((__m128i*) & p_src6[j]); /* C0: three rows up and three rows down, same column */
+                T01 = _mm_loadu_si128((__m128i*) & p_src5[j]);
+                E00 = _mm_unpacklo_epi16(T00, T01);
+                E01 = _mm_unpackhi_epi16(T00, T01);
+                S00 = _mm_madd_epi16(E00, C0);
+                S01 = _mm_madd_epi16(E01, C0);
+                /* C1: two rows up and two rows down, same column */
+                T10 = _mm_loadu_si128((__m128i*) & p_src4[j]);
+                T11 = _mm_loadu_si128((__m128i*) & p_src3[j]);
+                E10 = _mm_unpacklo_epi16(T10, T11);
+                E11 = _mm_unpackhi_epi16(T10, T11);
+                S10 = _mm_madd_epi16(E10, C1);
+                S11 = _mm_madd_epi16(E11, C1);
+                /* C2: up-left and down-right neighbours */
+                T20 = _mm_loadu_si128((__m128i*) & p_src2[j - 1]);
+                T21 = _mm_loadu_si128((__m128i*) & p_src1[j + 1]);
+                E20 = _mm_unpacklo_epi16(T20, T21);
+                E21 = _mm_unpackhi_epi16(T20, T21);
+                S20 = _mm_madd_epi16(E20, C2);
+                S21 = _mm_madd_epi16(E21, C2);
+                /* C3: directly above and directly below */
+                T30 = _mm_loadu_si128((__m128i*) & p_src2[j]);
+                T31 = _mm_loadu_si128((__m128i*) & p_src1[j]);
+                E30 = _mm_unpacklo_epi16(T30, T31);
+                E31 = _mm_unpackhi_epi16(T30, T31);
+                S30 = _mm_madd_epi16(E30, C3);
+                S31 = _mm_madd_epi16(E31, C3);
+                /* C4: up-right and down-left neighbours */
+                T40 = _mm_loadu_si128((__m128i*) & p_src2[j + 1]);
+                T41 = _mm_loadu_si128((__m128i*) & p_src1[j - 1]);
+                E40 = _mm_unpacklo_epi16(T40, T41);
+                E41 = _mm_unpackhi_epi16(T40, T41);
+                S40 = _mm_madd_epi16(E40, C4);
+                S41 = _mm_madd_epi16(E41, C4);
+                /* C5: current row, columns j-3 and j+3 (no horizontal clamping is done here) */
+                T40 = _mm_loadu_si128((__m128i*) & p_src[j - 3]);
+                T41 = _mm_loadu_si128((__m128i*) & p_src[j + 3]);
+                E40 = _mm_unpacklo_epi16(T40, T41);
+                E41 = _mm_unpackhi_epi16(T40, T41);
+                S50 = _mm_madd_epi16(E40, C5);
+                S51 = _mm_madd_epi16(E41, C5);
+                /* C6: current row, columns j-2 and j+2 */
+                T40 = _mm_loadu_si128((__m128i*) & p_src[j - 2]);
+                T41 = _mm_loadu_si128((__m128i*) & p_src[j + 2]);
+                E40 = _mm_unpacklo_epi16(T40, T41);
+                E41 = _mm_unpackhi_epi16(T40, T41);
+                S60 = _mm_madd_epi16(E40, C6);
+                S61 = _mm_madd_epi16(E41, C6);
+                /* C7: current row, columns j-1 and j+1 */
+                T40 = _mm_loadu_si128((__m128i*) & p_src[j - 1]);
+                T41 = _mm_loadu_si128((__m128i*) & p_src[j + 1]);
+                E40 = _mm_unpacklo_epi16(T40, T41);
+                E41 = _mm_unpackhi_epi16(T40, T41);
+                S70 = _mm_madd_epi16(E40, C7);
+                S71 = _mm_madd_epi16(E41, C7);
+                /* C8: centre pixel, paired with zero so the madd yields coeff * pixel */
+                T40 = _mm_loadu_si128((__m128i*) & p_src[j]);
+                E40 = _mm_unpacklo_epi16(T40, zero);
+                E41 = _mm_unpackhi_epi16(T40, zero);
+                S80 = _mm_madd_epi16(E40, C8);
+                S81 = _mm_madd_epi16(E41, C8);
+                /* sum of all nine taps for pixels 0..3 of the group */
+                SS1 = _mm_add_epi32(S00, S10);
+                SS1 = _mm_add_epi32(SS1, S20);
+                SS1 = _mm_add_epi32(SS1, S30);
+                SS1 = _mm_add_epi32(SS1, S40);
+                SS1 = _mm_add_epi32(SS1, S50);
+                SS1 = _mm_add_epi32(SS1, S60);
+                SS1 = _mm_add_epi32(SS1, S70);
+                SS1 = _mm_add_epi32(SS1, S80);
+                /* sum of all nine taps for pixels 4..7 of the group */
+                SS2 = _mm_add_epi32(S01, S11);
+                SS2 = _mm_add_epi32(SS2, S21);
+                SS2 = _mm_add_epi32(SS2, S31);
+                SS2 = _mm_add_epi32(SS2, S41);
+                SS2 = _mm_add_epi32(SS2, S51);
+                SS2 = _mm_add_epi32(SS2, S61);
+                SS2 = _mm_add_epi32(SS2, S71);
+                SS2 = _mm_add_epi32(SS2, S81);
+                /* round and scale: (sum + 32) >> 6 */
+                SS1 = _mm_add_epi32(SS1, mAddOffset);
+                SS1 = _mm_srai_epi32(SS1, 6);
+
+                SS2 = _mm_add_epi32(SS2, mAddOffset);
+                SS2 = _mm_srai_epi32(SS2, 6);
+                /* pack to 16 bits with unsigned saturation (negatives become 0), then clamp to max_pixel */
+                S = _mm_packus_epi32(SS1, SS2);
+                S = _mm_min_epu16(S, max_val);
+                if (j != lcu_pix_xEnd8) {
+                    _mm_storeu_si128((__m128i*)(p_dst + j), S);
+                }
+                else {
+                    _mm_maskmoveu_si128(S, mask, (char*)(p_dst + j)); /* partial group: only the bytes selected by mask are written */
+                    break;
+                }
+            }
+
+            p_src += stride;
+            p_dst += stride;
+        }
+    }
+    else {
+        for (i = startPos; i < endPos; i++) { /* width is a multiple of 8: same filter, every group stored in full */
+            int yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 1); /* rows i-1 / i+1, clamped to [startPos, endPos - 1] */
+            int yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 1);
+            p_src1 = p_src + (yBottom - i) * stride;
+            p_src2 = p_src + (yUp - i) * stride;
+            /* rows i-2 / i+2, clamped the same way */
+            yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 2);
+            yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 2);
+            p_src3 = p_src + (yBottom - i) * stride;
+            p_src4 = p_src + (yUp - i) * stride;
+            /* rows i-3 / i+3, clamped the same way */
+            yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 3);
+            yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 3);
+            p_src5 = p_src + (yBottom - i) * stride;
+            p_src6 = p_src + (yUp - i) * stride;
+            /* each tap interleaves two source vectors so one madd yields coeff * (a + b) as 32-bit sums */
+            for (j = lcu_pix_x; j < lcu_pix_xEnd; j += 8) {
+                T00 = _mm_loadu_si128((__m128i*) & p_src6[j]); /* C0: three rows up and three rows down, same column */
+                T01 = _mm_loadu_si128((__m128i*) & p_src5[j]);
+                E00 = _mm_unpacklo_epi16(T00, T01);
+                E01 = _mm_unpackhi_epi16(T00, T01);
+                S00 = _mm_madd_epi16(E00, C0);
+                S01 = _mm_madd_epi16(E01, C0);
+                /* C1: two rows up and two rows down, same column */
+                T10 = _mm_loadu_si128((__m128i*) & p_src4[j]);
+                T11 = _mm_loadu_si128((__m128i*) & p_src3[j]);
+                E10 = _mm_unpacklo_epi16(T10, T11);
+                E11 = _mm_unpackhi_epi16(T10, T11);
+                S10 = _mm_madd_epi16(E10, C1);
+                S11 = _mm_madd_epi16(E11, C1);
+                /* C2: up-left and down-right neighbours */
+                T20 = _mm_loadu_si128((__m128i*) & p_src2[j - 1]);
+                T21 = _mm_loadu_si128((__m128i*) & p_src1[j + 1]);
+                E20 = _mm_unpacklo_epi16(T20, T21);
+                E21 = _mm_unpackhi_epi16(T20, T21);
+                S20 = _mm_madd_epi16(E20, C2);
+                S21 = _mm_madd_epi16(E21, C2);
+                /* C3: directly above and directly below */
+                T30 = _mm_loadu_si128((__m128i*) & p_src2[j]);
+                T31 = _mm_loadu_si128((__m128i*) & p_src1[j]);
+                E30 = _mm_unpacklo_epi16(T30, T31);
+                E31 = _mm_unpackhi_epi16(T30, T31);
+                S30 = _mm_madd_epi16(E30, C3);
+                S31 = _mm_madd_epi16(E31, C3);
+                /* C4: up-right and down-left neighbours */
+                T40 = _mm_loadu_si128((__m128i*) & p_src2[j + 1]);
+                T41 = _mm_loadu_si128((__m128i*) & p_src1[j - 1]);
+                E40 = _mm_unpacklo_epi16(T40, T41);
+                E41 = _mm_unpackhi_epi16(T40, T41);
+                S40 = _mm_madd_epi16(E40, C4);
+                S41 = _mm_madd_epi16(E41, C4);
+                /* C5: current row, columns j-3 and j+3 (no horizontal clamping is done here) */
+                T40 = _mm_loadu_si128((__m128i*) & p_src[j - 3]);
+                T41 = _mm_loadu_si128((__m128i*) & p_src[j + 3]);
+                E40 = _mm_unpacklo_epi16(T40, T41);
+                E41 = _mm_unpackhi_epi16(T40, T41);
+                S50 = _mm_madd_epi16(E40, C5);
+                S51 = _mm_madd_epi16(E41, C5);
+                /* C6: current row, columns j-2 and j+2 */
+                T40 = _mm_loadu_si128((__m128i*) & p_src[j - 2]);
+                T41 = _mm_loadu_si128((__m128i*) & p_src[j + 2]);
+                E40 = _mm_unpacklo_epi16(T40, T41);
+                E41 = _mm_unpackhi_epi16(T40, T41);
+                S60 = _mm_madd_epi16(E40, C6);
+                S61 = _mm_madd_epi16(E41, C6);
+                /* C7: current row, columns j-1 and j+1 */
+                T40 = _mm_loadu_si128((__m128i*) & p_src[j - 1]);
+                T41 = _mm_loadu_si128((__m128i*) & p_src[j + 1]);
+                E40 = _mm_unpacklo_epi16(T40, T41);
+                E41 = _mm_unpackhi_epi16(T40, T41);
+                S70 = _mm_madd_epi16(E40, C7);
+                S71 = _mm_madd_epi16(E41, C7);
+                /* C8: centre pixel, paired with zero so the madd yields coeff * pixel */
+                T40 = _mm_loadu_si128((__m128i*) & p_src[j]);
+                E40 = _mm_unpacklo_epi16(T40, zero);
+                E41 = _mm_unpackhi_epi16(T40, zero);
+                S80 = _mm_madd_epi16(E40, C8);
+                S81 = _mm_madd_epi16(E41, C8);
+                /* sum of all nine taps for pixels 0..3 of the group */
+                SS1 = _mm_add_epi32(S00, S10);
+                SS1 = _mm_add_epi32(SS1, S20);
+                SS1 = _mm_add_epi32(SS1, S30);
+                SS1 = _mm_add_epi32(SS1, S40);
+                SS1 = _mm_add_epi32(SS1, S50);
+                SS1 = _mm_add_epi32(SS1, S60);
+                SS1 = _mm_add_epi32(SS1, S70);
+                SS1 = _mm_add_epi32(SS1, S80);
+                /* sum of all nine taps for pixels 4..7 of the group */
+                SS2 = _mm_add_epi32(S01, S11);
+                SS2 = _mm_add_epi32(SS2, S21);
+                SS2 = _mm_add_epi32(SS2, S31);
+                SS2 = _mm_add_epi32(SS2, S41);
+                SS2 = _mm_add_epi32(SS2, S51);
+                SS2 = _mm_add_epi32(SS2, S61);
+                SS2 = _mm_add_epi32(SS2, S71);
+                SS2 = _mm_add_epi32(SS2, S81);
+                /* round and scale: (sum + 32) >> 6 */
+                SS1 = _mm_add_epi32(SS1, mAddOffset);
+                SS1 = _mm_srai_epi32(SS1, 6);
+
+                SS2 = _mm_add_epi32(SS2, mAddOffset);
+                SS2 = _mm_srai_epi32(SS2, 6);
+                /* pack to 16 bits with unsigned saturation (negatives become 0), then clamp to max_pixel */
+                S = _mm_packus_epi32(SS1, SS2);
+                S = _mm_min_epu16(S, max_val);
+
+                _mm_storeu_si128((__m128i*)(p_dst + j), S);
+
+            }
+
+            p_src += stride;
+            p_dst += stride;
+        }
+    }
+}
+#endif
 
 
diff --git a/source/common/vec/intrinsic_dct.c b/source/common/vec/intrinsic_dct.c
index f5126a3..dfd3ff1 100644
--- a/source/common/vec/intrinsic_dct.c
+++ b/source/common/vec/intrinsic_dct.c
@@ -42,6 +42,7 @@
 
 #include "../basic_types.h"
 #include "../avs2_defs.h"
+#include "../common.h"
 #include "intrinsic.h"
 
 void *xavs2_fast_memzero_mmx(void *dst, size_t n);
@@ -396,9 +397,10 @@ ALIGN16(static const int16_t g_2TC_V[8 * (2 * SEC_TR_SIZE)]) = {
 
 /* ---------------------------------------------------------------------------
 futl change 2016.12.19*/
-void dct_c_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_4x4_sse128(xavs2_t *h,
+                      const coeff_t *src, coeff_t *dst, int i_src)
 {
-    const int SHIFT1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT;
+    const int SHIFT1 = B4X4_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT;
     const int SHIFT2 = B4X4_IN_BIT + FACTO_BIT;
     const int ADD1 = (1 << SHIFT1) >> 1;
     const int ADD2 = (1 << SHIFT2) >> 1;
@@ -484,9 +486,10 @@ void dct_c_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
 futl change 2016.12.19*/
-void dct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_8x8_sse128(xavs2_t *h,
+                      const coeff_t *src, coeff_t *dst, int i_src)
 {
-    const int SHIFT1 = B8X8_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT;
+    const int SHIFT1 = B8X8_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT;
     const int SHIFT2 = B8X8_IN_BIT + FACTO_BIT;
     const int ADD1 = (1 << SHIFT1) >> 1;
     const int ADD2 = (1 << SHIFT2) >> 1;
@@ -685,9 +688,10 @@ void dct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-void dct_c_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_16x4_sse128(xavs2_t *h,
+                       const coeff_t *src, coeff_t *dst, int i_src)
 {
-    const int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT;
+    const int shift1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT;
     const int shift2 = B16X16_IN_BIT + FACTO_BIT - 2;
     const int ADD1 = (1 << shift1) >> 1;
     const int ADD2 = (1 << shift2) >> 1;
@@ -973,9 +977,10 @@ void dct_c_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_4x16_sse128(xavs2_t *h,
+                       const coeff_t *src, coeff_t *dst, int i_src)
 {
-    const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2;
+    const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT - 2;
     const int ADD1 = (1 << SHIFT1) >> 1;
     const int SHIFT2 = B16X16_IN_BIT + FACTO_BIT;
     const int ADD2 = (1 << SHIFT2) >> 1;
@@ -1037,7 +1042,7 @@ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src)
     u11 = _mm_madd_epi16(t2, k_p17_p42);
     u13 = _mm_madd_epi16(t2, k_m42_p17);
 
-    //ÒÆÎ»²¹³¥
+    // shift compensation: add c_add1, then arithmetic shift right by SHIFT1
     u10 = _mm_srai_epi32(_mm_add_epi32(u10, c_add1), SHIFT1);
     u11 = _mm_srai_epi32(_mm_add_epi32(u11, c_add1), SHIFT1);
     u12 = _mm_srai_epi32(_mm_add_epi32(u12, c_add1), SHIFT1);
@@ -1061,7 +1066,7 @@ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src)
     u21 = _mm_madd_epi16(t2, k_p17_p42);
     u23 = _mm_madd_epi16(t2, k_m42_p17);
 
-    //ÒÆÎ»²¹³¥
+    // shift compensation: add c_add1, then arithmetic shift right by SHIFT1
     u20 = _mm_srai_epi32(_mm_add_epi32(u20, c_add1), SHIFT1);
     u21 = _mm_srai_epi32(_mm_add_epi32(u21, c_add1), SHIFT1);
     u22 = _mm_srai_epi32(_mm_add_epi32(u22, c_add1), SHIFT1);
@@ -1089,7 +1094,7 @@ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src)
     u11 = _mm_madd_epi16(t2, k_p17_p42);
     u13 = _mm_madd_epi16(t2, k_m42_p17);
 
-    //ÒÆÎ»²¹³¥
+    // shift compensation: add c_add1, then arithmetic shift right by SHIFT1
     u10 = _mm_srai_epi32(_mm_add_epi32(u10, c_add1), SHIFT1);
     u11 = _mm_srai_epi32(_mm_add_epi32(u11, c_add1), SHIFT1);
     u12 = _mm_srai_epi32(_mm_add_epi32(u12, c_add1), SHIFT1);
@@ -1111,7 +1116,7 @@ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src)
     u22 = _mm_madd_epi16(t0, k_p32_m32);
     u21 = _mm_madd_epi16(t2, k_p17_p42);
     u23 = _mm_madd_epi16(t2, k_m42_p17);
-    //ÒÆÎ»²¹³¥
+    // shift compensation: add c_add1, then arithmetic shift right by SHIFT1
     u20 = _mm_srai_epi32(_mm_add_epi32(u20, c_add1), SHIFT1);
     u21 = _mm_srai_epi32(_mm_add_epi32(u21, c_add1), SHIFT1);
     u22 = _mm_srai_epi32(_mm_add_epi32(u22, c_add1), SHIFT1);
@@ -1343,9 +1348,10 @@ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-void dct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_16x16_sse128(xavs2_t *h,
+                        const coeff_t *src, coeff_t *dst, int i_src)
 {
-    const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT;
+    const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT;
     const int SHIFT2 = B16X16_IN_BIT + FACTO_BIT;
     const int ADD1 = (1 << SHIFT1) >> 1;
     const int ADD2 = (1 << SHIFT2) >> 1;
@@ -1765,10 +1771,11 @@ void dct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-void dct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_8x32_sse128(xavs2_t *h,
+                       const coeff_t *src, coeff_t *dst, int i_src)
 {
     int i;
-    int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01);
+    int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01);
     int shift2 = B32X32_IN_BIT + FACTO_BIT;
     const int ADD1 = (1 << shift1) >> 1;
     const int ADD2 = (1 << shift2) >> 1;
@@ -1853,7 +1860,7 @@ void dct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_src)
     I5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
     I6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
     I7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- 
+
         TRANSPOSE_8x8(in0, in1, in2, in3, in4, in5, in6, in7)
 #undef TRANSPOSE_8x8
 
@@ -2131,7 +2138,7 @@ void dct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_src)
     \
     tResult = _mm_packs_epi32(T60, TT60); \
     _mm_storeu_si128((__m128i*)&dst[(dstPos)* 8], tResult); \
- 
+
     MAKE_ODD(44, 44, 44, 44, 0);
     MAKE_ODD(45, 45, 45, 45, 16);
     MAKE_ODD(46, 47, 46, 47, 8);
@@ -2173,10 +2180,11 @@ void dct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-void dct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_32x8_sse128(xavs2_t *h,
+                       const coeff_t *src, coeff_t *dst, int i_src)
 {
     int i;
-    int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT;
+    int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT;
     int shift2 = B32X32_IN_BIT + FACTO_BIT - 2 - (i_src & 0x01);
     const int ADD1 = (1 << shift1) >> 1;
     const int ADD2 = (1 << shift2) >> 1;
@@ -2535,9 +2543,10 @@ void dct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_src)
 
 /* ---------------------------------------------------------------------------
  */
-void dct_c_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_32x32_half_sse128(xavs2_t *h,
+                             const coeff_t *src, coeff_t *dst, int i_src)
 {
-    const int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01);
+    const int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01);
     const int SHIFT2 = B32X32_IN_BIT + FACTO_BIT;
     const int ADD1 = (1 << shift1) >> 1;
     const int ADD2 = (1 << SHIFT2) >> 1;
@@ -2809,7 +2818,7 @@ void dct_c_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_src)
     /* clear result buffer */
     xavs2_fast_memzero_mmx(dst, 32 * 32 * sizeof(coeff_t));
 
-    // DCT2, Ö»±£ÁôÇ°16ÐÐºÍÇ°16ÁÐ
+    // DCT2, keep only the first 16 rows and the first 16 columns
     for (i = 0; i < 16 / 4; i++) {
         // OPT_ME: to avoid register spill, I use matrix multiply, have other way?
         T00A = im[i * 4 + 0][0];    // [07 06 05 04 03 02 01 00]
@@ -2920,9 +2929,10 @@ void dct_c_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_src)
 }
 
 //optimize 32x32 size transform
-void dct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_32x32_sse128(xavs2_t *h,
+                        const coeff_t *src, coeff_t *dst, int i_src)
 {
-    const int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01);
+    const int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01);
     const int SHIFT2 = B32X32_IN_BIT + FACTO_BIT;
     const int ADD1 = (1 << shift1) >> 1;
     const int ADD2 = (1 << SHIFT2) >> 1;
@@ -2990,7 +3000,7 @@ void dct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_src)
         T07D = _mm_load_si128((__m128i*)(src + 24));
         src += i_src;
 
-        //_mm_load_si128((__m128i)tab_dct_16_0[1]) »»³É *((__m128i*)tab_dct_16_0[1])
+        //_mm_load_si128((__m128i)tab_dct_16_0[1]) replaced with *((__m128i*)tab_dct_16_0[1])
         T00A = _mm_shuffle_epi8(T00A, *((__m128i*)tab_dct_16_0[1]));   // [05 02 06 01 04 03 07 00]
         T00B = _mm_shuffle_epi8(T00B, *((__m128i*)tab_dct_32_0[0]));   // [10 13 09 14 11 12 08 15]
         T00C = _mm_shuffle_epi8(T00C, *((__m128i*)tab_dct_16_0[1]));   // [21 18 22 17 20 19 23 16]
@@ -3087,7 +3097,7 @@ void dct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_src)
         T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1);
         T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1);
         T60 = _mm_packs_epi32(T50, T51);
-        im[0][i] = T60;//16¸ö0µ½8ÐÐ¼ÆËã³öÀ´µÄ±ä»»ÏµÊý(16 bit per bit width)
+        im[0][i] = T60;// 16 transform coefficients computed from rows 0 to 8 (16 bit per bit width)
 
         T50 = _mm_hsub_epi32(T40, T41);
         T51 = _mm_hsub_epi32(T42, T43);
@@ -3890,9 +3900,10 @@ void transform_2nd_sse128(coeff_t *coeff, int i_coeff, int i_mode, int b_top, in
 
 /* ---------------------------------------------------------------------------
  */
-void transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff)
+void transform_4x4_2nd_sse128(xavs2_t *h,
+                              coeff_t *coeff, int i_coeff)
 {
-    const int SHIFT1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + 1;
+    const int SHIFT1 = B4X4_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT + 1;
     const int SHIFT2 = B4X4_IN_BIT + FACTO_BIT + 1;
     const int ADD1 = 1 << (SHIFT1 - 1);
     const int ADD2 = 1 << (SHIFT2 - 1);
@@ -3998,7 +4009,7 @@ void transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff)
 
 
 
-// transpose 8x8 & transpose 16x16(¾ØÕó×ªÖÃ)
+// transpose 8x8 & transpose 16x16 (matrix transpose)
 #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
     tr0_0 = _mm_unpacklo_epi16(I0, I1); \
     tr0_1 = _mm_unpacklo_epi16(I2, I3); \
@@ -4024,22 +4035,22 @@ void transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff)
     O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
     O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
     O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- 
+
 #define TRANSPOSE_16x16_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1) \
         TRANSPOSE_8x8_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0); \
         TRANSPOSE_8x8_16BIT(A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1); \
         TRANSPOSE_8x8_16BIT(A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0); \
         TRANSPOSE_8x8_16BIT(A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1); \
- 
+
 void wavelet_16x64_sse128(coeff_t *coeff)
 {
-    //ï¿½ï¿½ï¿½ï¿½ 16*64
+    // 16*64 (leading words of the original comment were lost to an encoding error)
     __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2];
 
-    //ï¿½ï¿½ï¿½ï¿½ 64*16
+    // 64*16 (leading words of the original comment were lost to an encoding error)
     __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8];
 
-    //ï¿½ï¿½Ê±
+    // temporaries
     __m128i B00, B01, B02, B03, B04, B05, B06, B07, B08, B09, B10, B11, B12, B13, B14, B15, B16, B17, B18, B19, B20, B21, B22, B23, B24, B25, B26, B27, B28, B29, B30, B31;
     __m128i B32, B33, B34, B35, B36, B37, B38, B39, B40, B41, B42, B43, B44, B45, B46, B47, B48, B49, B50, B51, B52, B53, B54, B55, B56, B57, B58, B59, B60, B61, B62, B63;
 
@@ -4166,7 +4177,7 @@ void wavelet_16x64_sse128(coeff_t *coeff)
     }
 
     /* step 2: vertical transform */
-    /* copy ×ªï¿½ï¿½*/
+    /* copy and transpose */
     TRANSPOSE_8x8_16BIT(T00[0], T02[0], T04[0], T06[0], T08[0], T10[0], T12[0], T14[0], B00, B01, B02, B03, B04, B05, B06, B07);
     TRANSPOSE_8x8_16BIT(T00[1], T02[1], T04[1], T06[1], T08[1], T10[1], T12[1], T14[1], B08, B09, B10, B11, B12, B13, B14, B15);
     TRANSPOSE_8x8_16BIT(T00[2], T02[2], T04[2], T06[2], T08[2], T10[2], T12[2], T14[2], B16, B17, B18, B19, B20, B21, B22, B23);
@@ -4290,16 +4301,16 @@ void wavelet_16x64_sse128(coeff_t *coeff)
 
 void wavelet_64x16_sse128(coeff_t *coeff)
 {
-    //ï¿½ï¿½ï¿½ï¿½ 16*64
+    // 16*64 (leading words of the original comment were lost to an encoding error)
     __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2];
 
-    //ï¿½ï¿½ï¿½ï¿½ 64*16
+    // 64*16 (leading words of the original comment were lost to an encoding error)
     __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8];
 
-    //ï¿½ï¿½Ê± 64*16
+    // temporary, 64*16
     __m128i A00[4], A01[4], A02[4], A03[4], A04[4], A05[4], A06[4], A07[4], A08[4], A09[4], A10[4], A11[4], A12[4], A13[4], A14[4], A15[4];
 
-    //ÁÙÊ±
+    // temporaries
     __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
     __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
     int i;
@@ -4497,7 +4508,7 @@ void wavelet_64x16_sse128(coeff_t *coeff)
     V62[1] = _mm_add_epi16(V62[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V61[1], V63[1]), mAddOffset2), 2));
 
     /* step 2: vertical transform */
-    //×ªï¿½ï¿½
+    // transpose
     TRANSPOSE_8x8_16BIT(V00[0], V02[0], V04[0], V06[0], V08[0], V10[0], V12[0], V14[0], A00[0], A01[0], A02[0], A03[0], A04[0], A05[0], A06[0], A07[0]);
     TRANSPOSE_8x8_16BIT(V16[0], V18[0], V20[0], V22[0], V24[0], V26[0], V28[0], V30[0], A00[1], A01[1], A02[1], A03[1], A04[1], A05[1], A06[1], A07[1]);
     TRANSPOSE_8x8_16BIT(V32[0], V34[0], V36[0], V38[0], V40[0], V42[0], V44[0], V46[0], A00[2], A01[2], A02[2], A03[2], A04[2], A05[2], A06[2], A07[2]);
@@ -4547,13 +4558,13 @@ void wavelet_64x16_sse128(coeff_t *coeff)
 
 void wavelet_64x64_sse128(coeff_t *coeff)
 {
-    //ï¿½ï¿½ï¿½ï¿½ 16*64
+    // 16*64 (leading words of the original comment were lost to an encoding error)
     __m128i V00[8], V01[8], V02[8], V03[8], V04[8], V05[8], V06[8], V07[8], V08[8], V09[8], V10[8], V11[8], V12[8], V13[8], V14[8], V15[8], V16[8], V17[8], V18[8], V19[8], V20[8], V21[8], V22[8], V23[8], V24[8], V25[8], V26[8], V27[8], V28[8], V29[8], V30[8], V31[8], V32[8], V33[8], V34[8], V35[8], V36[8], V37[8], V38[8], V39[8], V40[8], V41[8], V42[8], V43[8], V44[8], V45[8], V46[8], V47[8], V48[8], V49[8], V50[8], V51[8], V52[8], V53[8], V54[8], V55[8], V56[8], V57[8], V58[8], V59[8], V60[8], V61[8], V62[8], V63[8];
 
-    //ï¿½ï¿½ï¿½ï¿½ 64*64
+    // 64*64 (leading words of the original comment were lost to an encoding error)
     __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8], T16[8], T17[8], T18[8], T19[8], T20[8], T21[8], T22[8], T23[8], T24[8], T25[8], T26[8], T27[8], T28[8], T29[8], T30[8], T31[8], T32[8], T33[8], T34[8], T35[8], T36[8], T37[8], T38[8], T39[8], T40[8], T41[8], T42[8], T43[8], T44[8], T45[8], T46[8], T47[8], T48[8], T49[8], T50[8], T51[8], T52[8], T53[8], T54[8], T55[8], T56[8], T57[8], T58[8], T59[8], T60[8], T61[8], T62[8], T63[8];
 
-    //ÁÙÊ± 32*64
+    // temporary, 32*64
     __m128i A00[4], A01[4], A02[4], A03[4], A04[4], A05[4], A06[4], A07[4], A08[4], A09[4], A10[4], A11[4], A12[4], A13[4], A14[4], A15[4], A16[4], A17[4], A18[4], A19[4], A20[4], A21[4], A22[4], A23[4], A24[4], A25[4], A26[4], A27[4], A28[4], A29[4], A30[4], A31[4], A32[4], A33[4], A34[4], A35[4], A36[4], A37[4], A38[4], A39[4], A40[4], A41[4], A42[4], A43[4], A44[4], A45[4], A46[4], A47[4], A48[4], A49[4], A50[4], A51[4], A52[4], A53[4], A54[4], A55[4], A56[4], A57[4], A58[4], A59[4], A60[4], A61[4], A62[4], A63[4];
 
     __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
@@ -4636,7 +4647,7 @@ void wavelet_64x64_sse128(coeff_t *coeff)
         T62[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 62]);
         T63[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 63]);
     }
-    //0-15ï¿½ï¿½×ªï¿½ï¿½
+    // transpose 0-15 (T00..T15)
     TRANSPOSE_16x16_16BIT(
         T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0],
         T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1],
@@ -4661,7 +4672,7 @@ void wavelet_64x64_sse128(coeff_t *coeff)
         V48[0], V49[0], V50[0], V51[0], V52[0], V53[0], V54[0], V55[0], V56[0], V57[0], V58[0], V59[0], V60[0], V61[0], V62[0], V63[0],
         V48[1], V49[1], V50[1], V51[1], V52[1], V53[1], V54[1], V55[1], V56[1], V57[1], V58[1], V59[1], V60[1], V61[1], V62[1], V63[1]
     );
-    //16-31ï¿½ï¿½×ªï¿½ï¿½
+    // transpose 16-31 (T16..T31)
     TRANSPOSE_16x16_16BIT(
         T16[0], T17[0], T18[0], T19[0], T20[0], T21[0], T22[0], T23[0], T24[0], T25[0], T26[0], T27[0], T28[0], T29[0], T30[0], T31[0],
         T16[1], T17[1], T18[1], T19[1], T20[1], T21[1], T22[1], T23[1], T24[1], T25[1], T26[1], T27[1], T28[1], T29[1], T30[1], T31[1],
@@ -4686,7 +4697,7 @@ void wavelet_64x64_sse128(coeff_t *coeff)
         V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2],
         V48[3], V49[3], V50[3], V51[3], V52[3], V53[3], V54[3], V55[3], V56[3], V57[3], V58[3], V59[3], V60[3], V61[3], V62[3], V63[3]
     );
-    //32-47ï¿½ï¿½×ªï¿½ï¿½
+    // transpose 32-47 (T32..T47)
     TRANSPOSE_16x16_16BIT(
         T32[0], T33[0], T34[0], T35[0], T36[0], T37[0], T38[0], T39[0], T40[0], T41[0], T42[0], T43[0], T44[0], T45[0], T46[0], T47[0],
         T32[1], T33[1], T34[1], T35[1], T36[1], T37[1], T38[1], T39[1], T40[1], T41[1], T42[1], T43[1], T44[1], T45[1], T46[1], T47[1],
@@ -4711,7 +4722,7 @@ void wavelet_64x64_sse128(coeff_t *coeff)
         V48[4], V49[4], V50[4], V51[4], V52[4], V53[4], V54[4], V55[4], V56[4], V57[4], V58[4], V59[4], V60[4], V61[4], V62[4], V63[4],
         V48[5], V49[5], V50[5], V51[5], V52[5], V53[5], V54[5], V55[5], V56[5], V57[5], V58[5], V59[5], V60[5], V61[5], V62[5], V63[5]
     );
-    //48-63ï¿½ï¿½×ªï¿½ï¿½
+    // transpose 48-63 (T48..T63)
     TRANSPOSE_16x16_16BIT(
         T48[0], T49[0], T50[0], T51[0], T52[0], T53[0], T54[0], T55[0], T56[0], T57[0], T58[0], T59[0], T60[0], T61[0], T62[0], T63[0],
         T48[1], T49[1], T50[1], T51[1], T52[1], T53[1], T54[1], T55[1], T56[1], T57[1], T58[1], T59[1], T60[1], T61[1], T62[1], T63[1],
@@ -4840,7 +4851,7 @@ void wavelet_64x64_sse128(coeff_t *coeff)
         A48[0], A49[0], A50[0], A51[0], A52[0], A53[0], A54[0], A55[0], A56[0], A57[0], A58[0], A59[0], A60[0], A61[0], A62[0], A63[0],
         A48[1], A49[1], A50[1], A51[1], A52[1], A53[1], A54[1], A55[1], A56[1], A57[1], A58[1], A59[1], A60[1], A61[1], A62[1], A63[1]
     );
-    //16-31ï¿½ï¿½
+    // 16-31 (taken from the even-indexed V32..V62)
     TRANSPOSE_16x16_16BIT(
         V32[0], V34[0], V36[0], V38[0], V40[0], V42[0], V44[0], V46[0], V48[0], V50[0], V52[0], V54[0], V56[0], V58[0], V60[0], V62[0],
         V32[1], V34[1], V36[1], V38[1], V40[1], V42[1], V44[1], V46[1], V48[1], V50[1], V52[1], V54[1], V56[1], V58[1], V60[1], V62[1],
@@ -4986,40 +4997,44 @@ void wavelet_64x64_sse128(coeff_t *coeff)
 
 /* ---------------------------------------------------------------------------
  */
-void dct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_64x64_sse128(xavs2_t *h,   /* forwarded unchanged to dct_c_32x32_sse128 */
+                        const coeff_t *src, coeff_t *dst, int i_src)   /* src and i_src are ignored; the wavelet and the DCT both work on dst */
 {
     UNUSED_PARAMETER(src);
     UNUSED_PARAMETER(i_src);
     wavelet_64x64_sse128(dst);
-    dct_c_32x32_sse128(dst, dst, 32 | 1);
+    dct_c_32x32_sse128(h, dst, dst, 32 | 1);   /* dst is both input and output; the set low bit is read by the callee as (i_src & 0x01) and added to its first-stage shift */
 }
 
 /* ---------------------------------------------------------------------------
  */
-void dct_c_64x64_half_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_64x64_half_sse128(xavs2_t *h,   /* forwarded unchanged to dct_c_32x32_half_sse128 */
+                             const coeff_t *src, coeff_t *dst, int i_src)   /* src and i_src are ignored; the wavelet and the DCT both work on dst */
 {
     UNUSED_PARAMETER(src);
     UNUSED_PARAMETER(i_src);
     wavelet_64x64_sse128(dst);
-    dct_c_32x32_half_sse128(dst, dst, 32 | 1);
+    dct_c_32x32_half_sse128(h, dst, dst, 32 | 1);   /* dst is both input and output; the set low bit is read by the callee as (i_src & 0x01) and added to its first-stage shift */
 }
 
 /* ---------------------------------------------------------------------------
  */
-void dct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_64x16_sse128(xavs2_t *h,   /* forwarded unchanged to dct_c_32x8_sse128 */
+                        const coeff_t *src, coeff_t *dst, int i_src)   /* src and i_src are ignored; the wavelet and the DCT both work on dst */
 {
     UNUSED_PARAMETER(src);
     UNUSED_PARAMETER(i_src);
     wavelet_64x16_sse128(dst);
-    dct_c_32x8_sse128(dst, dst, 32 | 0x01);
+    dct_c_32x8_sse128(h, dst, dst, 32 | 0x01);   /* dst is both input and output; the set low bit is read by the callee as (i_src & 0x01) and subtracted from its second-stage shift */
 }
 
 /* ---------------------------------------------------------------------------
  */
-void dct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_src)
+void dct_c_16x64_sse128(xavs2_t *h,   /* forwarded unchanged to dct_c_8x32_sse128 */
+                        const coeff_t *src, coeff_t *dst, int i_src)   /* src and i_src are ignored; the wavelet and the DCT both work on dst */
 {
     UNUSED_PARAMETER(src);
     UNUSED_PARAMETER(i_src);
     wavelet_16x64_sse128(dst);
-    dct_c_8x32_sse128(dst, dst, 8 | 0x01);
+    dct_c_8x32_sse128(h, dst, dst, 8 | 0x01);   /* dst is both input and output; the set low bit is read by the callee as (i_src & 0x01) and added to its first-stage shift */
 }
diff --git a/source/common/vec/intrinsic_deblock.c b/source/common/vec/intrinsic_deblock.c
index 563ef0c..bb4be29 100644
--- a/source/common/vec/intrinsic_deblock.c
+++ b/source/common/vec/intrinsic_deblock.c
@@ -42,6 +42,7 @@
 #include <tmmintrin.h>
 #include <smmintrin.h>
 
+#if !HIGH_BIT_DEPTH
 void deblock_edge_ver_sse128(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag)
 {
     pel_t *pTmp = SrcPtr - 4;
@@ -805,3 +806,844 @@ void deblock_edge_hor_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int A
     ((int32_t*)(SrcPtrV - inc2))[0] = M128_I32(UL1, 1);
     ((int32_t*)(SrcPtrV + inc ))[0] = M128_I32(UR1, 1);
 }
+#else
+/*****************************************************************************
+*  Copyright (C) 2016 uavs2dec project,
+*  National Engineering Laboratory for Video Technology(Shenzhen),
+*  Digital Media R&D Center at Peking University Shenzhen Graduate School, China
+*  Project Leader: Ronggang Wang <rgwang@pkusz.edu.cn>
+*
+*  Main Authors: Zhenyu Wang <wangzhenyu@pkusz.edu.cn>, Kui Fan <kuifan@pku.edu.cn>
+*               Shenghao Zhang <1219759986@qq.com>, Bingjie Han, Kaili Yao, Hongbin Cao,  Yueming Wang,
+*               Jing Su, Jiaying Yan, Junru Li
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at rgwang@pkusz.edu.cn.
+*****************************************************************************/
+
+void deblock_edge_ver_sse128(pel_t* SrcPtr, int stride, int Alpha, int Beta, uint8_t* flt_flag)
+{
+	pel_t* pTmp = SrcPtr - 4;
+	int flag0 = flt_flag[0] ? -1 : 0;
+	int flag1 = flt_flag[1] ? -1 : 0;
+	__m128i TL0, TL1, TL2, TL3;
+	__m128i TR0, TR1, TR2, TR3;
+	__m128i TL0l, TL1l, TL2l;
+	__m128i TR0l, TR1l, TR2l;
+	__m128i V0, V1, V2, V3, V4, V5;
+	__m128i T0, T1, T2, T3, T4, T5, T6, T7;
+	__m128i M0, M1, M2, M3, M4, M5, M6, M7;
+	__m128i FLT_L, FLT_R, FLT, FS;
+	__m128i FS3, FS4, FS56;
+
+	__m128i ALPHA = _mm_set1_epi16((short)Alpha);
+	__m128i BETA = _mm_set1_epi16((short)Beta);
+	__m128i c_0 = _mm_set1_epi16(0);
+	__m128i c_1 = _mm_set1_epi16(1);
+	__m128i c_2 = _mm_set1_epi16(2);
+	__m128i c_3 = _mm_set1_epi16(3);
+	__m128i c_4 = _mm_set1_epi16(4);
+	__m128i c_8 = _mm_set1_epi16(8);
+	__m128i c_16 = _mm_set1_epi16(16);
+
+	T0 = _mm_loadu_si128((__m128i*)(pTmp));
+	T1 = _mm_loadu_si128((__m128i*)(pTmp + stride));
+	T2 = _mm_loadu_si128((__m128i*)(pTmp + stride * 2));
+	T3 = _mm_loadu_si128((__m128i*)(pTmp + stride * 3));
+	T4 = _mm_loadu_si128((__m128i*)(pTmp + stride * 4));
+	T5 = _mm_loadu_si128((__m128i*)(pTmp + stride * 5));
+	T6 = _mm_loadu_si128((__m128i*)(pTmp + stride * 6));
+	T7 = _mm_loadu_si128((__m128i*)(pTmp + stride * 7));
+
+	M0 = _mm_unpacklo_epi16(T0, T1);
+	M1 = _mm_unpackhi_epi16(T0, T1);
+	M2 = _mm_unpacklo_epi16(T2, T3);
+	M3 = _mm_unpackhi_epi16(T2, T3);
+	M4 = _mm_unpacklo_epi16(T4, T5);
+	M5 = _mm_unpackhi_epi16(T4, T5);
+	M6 = _mm_unpacklo_epi16(T6, T7);
+	M7 = _mm_unpackhi_epi16(T6, T7);
+
+	T0 = _mm_unpacklo_epi32(M0, M2);
+	T1 = _mm_unpackhi_epi32(M0, M2);
+	T2 = _mm_unpacklo_epi32(M1, M3);
+	T3 = _mm_unpackhi_epi32(M1, M3);
+	T4 = _mm_unpacklo_epi32(M4, M6);
+	T5 = _mm_unpackhi_epi32(M4, M6);
+	T6 = _mm_unpacklo_epi32(M5, M7);
+	T7 = _mm_unpackhi_epi32(M5, M7);
+
+	TL3 = _mm_unpacklo_epi64(T0, T4);
+	TL2 = _mm_unpackhi_epi64(T0, T4);
+	TR0 = _mm_unpacklo_epi64(T2, T6);
+	TR1 = _mm_unpackhi_epi64(T2, T6);
+	TL1 = _mm_unpacklo_epi64(T1, T5);
+	TL0 = _mm_unpackhi_epi64(T1, T5);
+	TR2 = _mm_unpacklo_epi64(T3, T7);
+	TR3 = _mm_unpackhi_epi64(T3, T7);
+
+#define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b))
+
+	T0 = _mm_subabs_epu16(TL0, TR0);
+	T1 = _mm_cmpgt_epi16(T0, c_1);
+	T2 = _mm_cmpgt_epi16(ALPHA, T0);
+
+	M0 = _mm_set_epi32(flag1, flag1, flag0, flag0);
+	M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1
+
+	T0 = _mm_subabs_epu16(TL1, TL0);
+	T1 = _mm_subabs_epu16(TR1, TR0);
+	FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2);
+	FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2);
+
+	T0 = _mm_subabs_epu16(TL2, TL0);
+	T1 = _mm_subabs_epu16(TR2, TR0);
+	M1 = _mm_cmpgt_epi16(BETA, T0);
+	M2 = _mm_cmpgt_epi16(BETA, T1);
+	FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L);
+	FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R);
+	FLT = _mm_add_epi16(FLT_L, FLT_R);
+
+	M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1));
+	T0 = _mm_sub_epi16(FLT, c_2);
+	T1 = _mm_sub_epi16(FLT, c_3);
+	T2 = _mm_subabs_epu16(TL1, TR1);
+
+	FS56 = _mm_blendv_epi8(T1, T0, M1);
+	FS4 = _mm_blendv_epi8(c_1, c_2, _mm_cmpeq_epi16(FLT_L, c_2));
+	FS3 = _mm_blendv_epi8(c_0, c_1, _mm_cmpgt_epi16(BETA, T2));
+
+	FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4));
+	FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4));
+	FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, c_3));
+
+	FS = _mm_and_si128(FS, M0);
+
+#undef _mm_subabs_epu16
+
+
+	TL0l = TL0;
+	TL1l = TL1;
+	TR0l = TR0;
+	TR1l = TR1;
+
+	/* fs == 1 */
+	T2 = _mm_add_epi16(_mm_add_epi16(TL0l, TR0l), c_2); // L0 + R0 + 2
+
+	V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0l, 1), T2), 2);
+
+	V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0l, 1), T2), 2);
+
+	TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1));
+	TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1));
+
+	/* fs == 2 */
+	T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4
+	T3 = _mm_slli_epi16(T3, 1);
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL1l, 1), _mm_add_epi16(TL1l, TR0l));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL0l, 3), _mm_add_epi16(T0, T2));
+
+	V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4);
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR1l, 1), _mm_add_epi16(TR1l, TL0l));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR0l, 3), _mm_add_epi16(T0, T2));
+
+	V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4);
+
+	TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_2));
+	TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_2));
+
+	/* fs == 3 */
+	T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8
+	T3 = _mm_slli_epi16(T3, 1);
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL1l, 2), _mm_add_epi16(TL2, TR1l));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL0l, 1), _mm_add_epi16(T0, T2));
+
+	V0 = _mm_srli_epi16(T0, 4);
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR1l, 2), _mm_add_epi16(TR2, TL1l));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR0l, 1), _mm_add_epi16(T0, T2));
+
+	V1 = _mm_srli_epi16(T0, 4);
+
+	TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_3));
+	TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_3));
+
+	T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0l), _mm_slli_epi16(TL2, 1));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1l, 3));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0l, 2));
+	V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
+
+	T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0l), _mm_slli_epi16(TR2, 1));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1l, 3));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0l, 2));
+	V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
+
+	TL1 = _mm_blendv_epi8(TL1, V2, _mm_cmpeq_epi16(FS, c_3));
+	TR1 = _mm_blendv_epi8(TR1, V3, _mm_cmpeq_epi16(FS, c_3));
+
+	FS = _mm_cmpeq_epi16(FS, c_4);
+
+	if (!_mm_testz_si128(FS, _mm_set1_epi16(-1))) { /* fs == 4 */
+		TL2l = TL2;
+		TR2l = TR2;
+		/* cal L0/R0 */
+		T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TL0l, TL2l), TR0l), 3);
+		T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TL0l, TL2l));
+		T2 = _mm_add_epi16(_mm_slli_epi16(TR2l, 1), _mm_slli_epi16(TR2l, 2));
+		V0 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5);
+
+		T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TR0l, TR2l), TL0l), 3);
+		T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TR0l, TR2l));
+		T2 = _mm_add_epi16(_mm_slli_epi16(TL2l, 1), _mm_slli_epi16(TL2l, 2));
+		V1 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5);
+
+		TL0 = _mm_blendv_epi8(TL0, V0, FS);
+		TR0 = _mm_blendv_epi8(TR0, V1, FS);
+
+		/* cal L1/R1 */
+		T0 = _mm_slli_epi16(_mm_add_epi16(TL2l, TR0l), 1);
+		T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TL0l, 3), TL0l));
+		T2 = _mm_add_epi16(_mm_slli_epi16(TL2l, 2), _mm_add_epi16(TR0l, c_8));
+		V2 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4);
+
+		T0 = _mm_slli_epi16(_mm_add_epi16(TR2l, TL0l), 1);
+		T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TR0l, 3), TR0l));
+		T2 = _mm_add_epi16(_mm_slli_epi16(TR2l, 2), _mm_add_epi16(TL0l, c_8));
+		V3 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4);
+
+		TL1 = _mm_blendv_epi8(TL1, V2, FS);
+		TR1 = _mm_blendv_epi8(TR1, V3, FS);
+
+		/* cal L2/R2 */
+		T0 = _mm_add_epi16(_mm_slli_epi16(TL2l, 1), TL2l);
+		T2 = _mm_add_epi16(_mm_slli_epi16(TL0l, 2), TR0l);
+		V4 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3);
+
+		T0 = _mm_add_epi16(_mm_slli_epi16(TR2l, 1), TR2l);
+		T2 = _mm_add_epi16(_mm_slli_epi16(TR0l, 2), TL0l);
+		V5 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3);
+
+		TL2 = _mm_blendv_epi8(TL2, V4, FS);
+		TR2 = _mm_blendv_epi8(TR2, V5, FS);
+	}
+
+	/* store result */
+	M0 = _mm_unpacklo_epi16(TL3, TL2);
+	M1 = _mm_unpackhi_epi16(TL3, TL2);
+	M2 = _mm_unpacklo_epi16(TL1, TL0);
+	M3 = _mm_unpackhi_epi16(TL1, TL0);
+	M4 = _mm_unpacklo_epi16(TR0, TR1);
+	M5 = _mm_unpackhi_epi16(TR0, TR1);
+	M6 = _mm_unpacklo_epi16(TR2, TR3);
+	M7 = _mm_unpackhi_epi16(TR2, TR3);
+
+	T0 = _mm_unpacklo_epi32(M0, M2);
+	T1 = _mm_unpackhi_epi32(M0, M2);
+	T2 = _mm_unpacklo_epi32(M1, M3);
+	T3 = _mm_unpackhi_epi32(M1, M3);
+	T4 = _mm_unpacklo_epi32(M4, M6);
+	T5 = _mm_unpackhi_epi32(M4, M6);
+	T6 = _mm_unpacklo_epi32(M5, M7);
+	T7 = _mm_unpackhi_epi32(M5, M7);
+
+	M0 = _mm_unpacklo_epi64(T0, T4);
+	M1 = _mm_unpackhi_epi64(T0, T4);
+	M4 = _mm_unpacklo_epi64(T2, T6);
+	M5 = _mm_unpackhi_epi64(T2, T6);
+	M2 = _mm_unpacklo_epi64(T1, T5);
+	M3 = _mm_unpackhi_epi64(T1, T5);
+	M6 = _mm_unpacklo_epi64(T3, T7);
+	M7 = _mm_unpackhi_epi64(T3, T7);
+
+	pTmp = SrcPtr - 4;
+	_mm_storeu_si128((__m128i*)(pTmp), M0);
+	pTmp += stride;
+	_mm_storeu_si128((__m128i*)(pTmp), M1);
+	pTmp += stride;
+	_mm_storeu_si128((__m128i*)(pTmp), M2);
+	pTmp += stride;
+	_mm_storeu_si128((__m128i*)(pTmp), M3);
+	pTmp += stride;
+	_mm_storeu_si128((__m128i*)(pTmp), M4);
+	pTmp += stride;
+	_mm_storeu_si128((__m128i*)(pTmp), M5);
+	pTmp += stride;
+	_mm_storeu_si128((__m128i*)(pTmp), M6);
+	pTmp += stride;
+	_mm_storeu_si128((__m128i*)(pTmp), M7);
+}
+
+void deblock_edge_hor_sse128(pel_t* SrcPtr, int stride, int Alpha, int Beta, uint8_t* flt_flag)
+{ /* Deblocks 8 lanes across one horizontal edge: L0..L2 are rows SrcPtr - 1..3 * stride, R0..R2 are rows SrcPtr + 0..2 * stride. */
+	int inc = stride;
+	int inc2 = inc << 1;
+	int inc3 = inc + inc2;
+	int flag0 = flt_flag[0] ? -1 : 0; // all-ones mask when flt_flag[0] is non-zero
+	int flag1 = flt_flag[1] ? -1 : 0; // all-ones mask when flt_flag[1] is non-zero
+	/* NOTE(review): every operation below works on 16-bit lanes, so pel_t is presumably 16 bits wide -- confirm. */
+	__m128i TL0, TL1, TL2;
+	__m128i TR0, TR1, TR2;
+	__m128i TL0w, TL1w, TL2w, TR0w, TR1w, TR2w; // filtered values that get written back
+	__m128i V0, V1, V2, V3, V4, V5;
+	__m128i T0, T1, T2;
+	__m128i M0, M1, M2;
+	__m128i FLT_L, FLT_R, FLT, FS;
+	__m128i FS3, FS4, FS56;
+	/* thresholds and small constants broadcast to all eight 16-bit lanes */
+	__m128i ALPHA = _mm_set1_epi16((short)Alpha);
+	__m128i BETA = _mm_set1_epi16((short)Beta);
+	__m128i c_0 = _mm_set1_epi16(0);
+	__m128i c_1 = _mm_set1_epi16(1);
+	__m128i c_2 = _mm_set1_epi16(2);
+	__m128i c_3 = _mm_set1_epi16(3);
+	__m128i c_4 = _mm_set1_epi16(4);
+	__m128i c_8 = _mm_set1_epi16(8);
+	__m128i c_16 = _mm_set1_epi16(16);
+	/* unaligned loads of three rows on each side of the edge */
+	TL2 = _mm_loadu_si128((__m128i*)(SrcPtr - inc3));
+	TL1 = _mm_loadu_si128((__m128i*)(SrcPtr - inc2));
+	TL0 = _mm_loadu_si128((__m128i*)(SrcPtr - inc));
+	TR0 = _mm_loadu_si128((__m128i*)(SrcPtr + 0));
+	TR1 = _mm_loadu_si128((__m128i*)(SrcPtr + inc));
+	TR2 = _mm_loadu_si128((__m128i*)(SrcPtr + inc2));
+	/* per-lane |a - b|, built from a signed saturating subtract followed by abs */
+#define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b))
+
+	T0 = _mm_subabs_epu16(TL0, TR0); // |L0 - R0|
+	T1 = _mm_cmpgt_epi16(T0, c_1); // |L0 - R0| > 1
+	T2 = _mm_cmpgt_epi16(ALPHA, T0); // |L0 - R0| < Alpha
+	M0 = _mm_set_epi32(flag1, flag1, flag0, flag0); // lanes 0-3 gated by flt_flag[0], lanes 4-7 by flt_flag[1]
+	M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1: lanes that are allowed to be filtered
+	/* FLT_L / FLT_R: +2 when |x1 - x0| < Beta, +1 when |x2 - x0| < Beta */
+	T0 = _mm_subabs_epu16(TL1, TL0);
+	T1 = _mm_subabs_epu16(TR1, TR0);
+	FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2);
+	FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2);
+
+	T0 = _mm_subabs_epu16(TL2, TL0);
+	T1 = _mm_subabs_epu16(TR2, TR0);
+	M1 = _mm_cmpgt_epi16(BETA, T0);
+	M2 = _mm_cmpgt_epi16(BETA, T1);
+	FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L);
+	FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R);
+	FLT = _mm_add_epi16(FLT_L, FLT_R);
+	/* M1: lanes where R0 == R1 and L0 == L1 */
+	M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1));
+	T0 = _mm_subs_epi16(FLT, c_2);
+	T1 = _mm_subs_epi16(FLT, c_3);
+	T2 = _mm_subabs_epu16(TL1, TR1);
+	/* candidates: FS56 = M1 ? FLT - 2 : FLT - 3; FS4 = (FLT_L == 2) ? 2 : 1; FS3 = (|L1 - R1| < Beta) ? 1 : 0 */
+	FS56 = _mm_blendv_epi8(T1, T0, M1);
+	FS4 = _mm_blendv_epi8(c_1, c_2, _mm_cmpeq_epi16(FLT_L, c_2));
+	FS3 = _mm_blendv_epi8(c_0, c_1, _mm_cmpgt_epi16(BETA, T2));
+	/* select by FLT: > 4 takes FS56, == 4 takes FS4, == 3 takes FS3, anything else gives 0 */
+	FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4));
+	FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4));
+	FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, c_3));
+	/* force the strength to 0 in lanes rejected by mask1 */
+	FS = _mm_and_si128(FS, M0);
+
+#undef _mm_subabs_epu16
+	/* TL* / TR* stay unmodified as filter inputs; the *w copies receive the blended results */
+	TR0w = TR0;
+	TR1w = TR1;
+	TL0w = TL0;
+	TL1w = TL1;
+
+	/* fs == 1: L0' = (3*L0 + R0 + 2) >> 2, R0' = (3*R0 + L0 + 2) >> 2 */
+	T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), c_2); // L0 + R0 + 2
+
+	V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0, 1), T2), 2);
+
+	V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0, 1), T2), 2);
+
+	TL0w = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1));
+	TR0w = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1));
+
+	/* fs == 2: L0' = (3*L1 + 10*L0 + 3*R0 + 8) >> 4, R0' mirrored */
+	T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 1), _mm_add_epi16(TL1, TR0));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 3), _mm_add_epi16(T0, T2));
+
+	V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4);
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 1), _mm_add_epi16(TR1, TL0));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 3), _mm_add_epi16(T0, T2));
+
+	V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4);
+
+	TL0w = _mm_blendv_epi8(TL0w, V0, _mm_cmpeq_epi16(FS, c_2));
+	TR0w = _mm_blendv_epi8(TR0w, V1, _mm_cmpeq_epi16(FS, c_2));
+
+	/* fs == 3: L0' = (L2 + 4*L1 + 6*L0 + 4*R0 + R1 + 8) >> 4, R0' mirrored */
+	T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 2), _mm_add_epi16(TL2, TR1));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 1), _mm_add_epi16(T0, T2));
+
+	V0 = _mm_srli_epi16(T0, 4);
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 2), _mm_add_epi16(TR2, TL1));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 1), _mm_add_epi16(T0, T2));
+
+	V1 = _mm_srli_epi16(T0, 4);
+
+	TL0w = _mm_blendv_epi8(TL0w, V0, _mm_cmpeq_epi16(FS, c_3));
+	TR0w = _mm_blendv_epi8(TR0w, V1, _mm_cmpeq_epi16(FS, c_3));
+	/* fs == 3 also rewrites L1/R1: L1' = (3*L2 + 8*L1 + 4*L0 + R0 + 8) >> 4, R1' mirrored */
+	T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0), _mm_slli_epi16(TL2, 1));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1, 3));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0, 2));
+	V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
+
+	T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0), _mm_slli_epi16(TR2, 1));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1, 3));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0, 2));
+	V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
+
+	TL1w = _mm_blendv_epi8(TL1w, V2, _mm_cmpeq_epi16(FS, c_3));
+	TR1w = _mm_blendv_epi8(TR1w, V3, _mm_cmpeq_epi16(FS, c_3));
+	/* from here on FS is a per-lane mask: all ones where the strength equals 4 */
+	FS = _mm_cmpeq_epi16(FS, c_4);
+
+	if (!_mm_testz_si128(FS, _mm_set1_epi16(-1))) { /* fs == 4 in at least one lane */
+		/* cal L0/R0: L0' = (9*L0 + 9*L2 + 8*R0 + 6*R2 + 16) >> 5, R0' mirrored */
+		T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TL0, TL2), TR0), 3);
+		T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TL0, TL2));
+		T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), _mm_slli_epi16(TR2, 2));
+		V0 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5);
+
+		T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TR0, TR2), TL0), 3);
+		T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TR0, TR2));
+		T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), _mm_slli_epi16(TL2, 2));
+		V1 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5);
+
+		TL0w = _mm_blendv_epi8(TL0w, V0, FS);
+		TR0w = _mm_blendv_epi8(TR0w, V1, FS);
+
+		/* cal L1/R1: L1' = (7*L0 + 6*L2 + 3*R0 + 8) >> 4, R1' mirrored */
+		T0 = _mm_slli_epi16(_mm_add_epi16(TL2, TR0), 1);
+		T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TL0, 3), TL0));
+		T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 2), _mm_add_epi16(TR0, c_8));
+		V2 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4);
+
+		T0 = _mm_slli_epi16(_mm_add_epi16(TR2, TL0), 1);
+		T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TR0, 3), TR0));
+		T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 2), _mm_add_epi16(TL0, c_8));
+		V3 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4);
+
+		TL1w = _mm_blendv_epi8(TL1w, V2, FS);
+		TR1w = _mm_blendv_epi8(TR1w, V3, FS);
+
+		/* cal L2/R2: L2' = (3*L2 + 4*L0 + R0 + 4) >> 3, R2' mirrored */
+		T0 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), TL2);
+		T2 = _mm_add_epi16(_mm_slli_epi16(TL0, 2), TR0);
+		V4 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3);
+
+		T0 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), TR2);
+		T2 = _mm_add_epi16(_mm_slli_epi16(TR0, 2), TL0);
+		V5 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3);
+
+		TL2w = _mm_blendv_epi8(TL2, V4, FS);
+		TR2w = _mm_blendv_epi8(TR2, V5, FS);
+
+		/* store result: all three rows on both sides of the edge */
+		_mm_storeu_si128((__m128i*)(SrcPtr - inc), TL0w);
+		_mm_storeu_si128((__m128i*)(SrcPtr - 0), TR0w);
+
+		_mm_storeu_si128((__m128i*)(SrcPtr - inc2), TL1w);
+		_mm_storeu_si128((__m128i*)(SrcPtr + inc), TR1w);
+
+		_mm_storeu_si128((__m128i*)(SrcPtr - inc3), TL2w);
+		_mm_storeu_si128((__m128i*)(SrcPtr + inc2), TR2w);
+	}
+	else {
+		/* store result: no lane has fs == 4, so rows L2/R2 are not written */
+		_mm_storeu_si128((__m128i*)(SrcPtr - inc), TL0w);
+		_mm_storeu_si128((__m128i*)(SrcPtr - 0), TR0w);
+
+		_mm_storeu_si128((__m128i*)(SrcPtr - inc2), TL1w);
+		_mm_storeu_si128((__m128i*)(SrcPtr + inc), TR1w);
+	}
+
+}
+
+void deblock_edge_ver_c_sse128(pel_t* SrcPtrU, pel_t* SrcPtrV, int stride, int Alpha, int Beta, uint8_t* flt_flag)
+{ /* Deblocks 4 rows of U and 4 rows of V across one vertical edge; columns SrcPtr - 4 .. SrcPtr + 3 of each row are read and written back. */
+	pel_t* pTmp;
+	int flag0 = flt_flag[0] ? -1 : 0; // all-ones mask when flt_flag[0] is non-zero
+	int flag1 = flt_flag[1] ? -1 : 0; // all-ones mask when flt_flag[1] is non-zero
+	/* NOTE(review): every operation below works on 16-bit lanes, so pel_t is presumably 16 bits wide -- confirm. */
+	__m128i UVL0, UVL1, UVL2, UVR0, UVR1, UVR2;
+	__m128i TL0, TL1, TL2, TL3;
+	__m128i TR0, TR1, TR2, TR3;
+	__m128i T0, T1, T2, T3, T4, T5, T6, T7;
+	__m128i V0, V1, V2, V3;
+	__m128i M0, M1, M2, M3, M4, M5, M6, M7;
+	__m128i FLT_L, FLT_R, FLT, FS;
+	__m128i FS4, FS56;
+	/* thresholds and small constants broadcast to all eight 16-bit lanes */
+	__m128i ALPHA = _mm_set1_epi16((short)Alpha);
+	__m128i BETA = _mm_set1_epi16((short)Beta);
+	__m128i c_0 = _mm_set1_epi16(0);
+	__m128i c_1 = _mm_set1_epi16(1);
+	__m128i c_2 = _mm_set1_epi16(2);
+	__m128i c_3 = _mm_set1_epi16(3);
+	__m128i c_4 = _mm_set1_epi16(4);
+	__m128i c_8 = _mm_set1_epi16(8);
+	/* four U rows: 8 samples each, 4 on either side of the edge */
+	pTmp = SrcPtrU - 4;
+	T0 = _mm_loadu_si128((__m128i*)(pTmp));
+	T1 = _mm_loadu_si128((__m128i*)(pTmp + stride));
+	T2 = _mm_loadu_si128((__m128i*)(pTmp + stride * 2));
+	T3 = _mm_loadu_si128((__m128i*)(pTmp + stride * 3));
+	/* four V rows, same window */
+	pTmp = SrcPtrV - 4;
+	T4 = _mm_loadu_si128((__m128i*)(pTmp));
+	T5 = _mm_loadu_si128((__m128i*)(pTmp + stride));
+	T6 = _mm_loadu_si128((__m128i*)(pTmp + stride * 2));
+	T7 = _mm_loadu_si128((__m128i*)(pTmp + stride * 3));
+	/* 8x8 transpose: afterwards each TL* / TR* holds one column, lanes 0-3 = U rows 0-3, lanes 4-7 = V rows 0-3 */
+	M0 = _mm_unpacklo_epi16(T0, T1);
+	M1 = _mm_unpackhi_epi16(T0, T1);
+	M2 = _mm_unpacklo_epi16(T2, T3);
+	M3 = _mm_unpackhi_epi16(T2, T3);
+	M4 = _mm_unpacklo_epi16(T4, T5);
+	M5 = _mm_unpackhi_epi16(T4, T5);
+	M6 = _mm_unpacklo_epi16(T6, T7);
+	M7 = _mm_unpackhi_epi16(T6, T7);
+
+	T0 = _mm_unpacklo_epi32(M0, M2);
+	T1 = _mm_unpackhi_epi32(M0, M2);
+	T2 = _mm_unpacklo_epi32(M1, M3);
+	T3 = _mm_unpackhi_epi32(M1, M3);
+	T4 = _mm_unpacklo_epi32(M4, M6);
+	T5 = _mm_unpackhi_epi32(M4, M6);
+	T6 = _mm_unpacklo_epi32(M5, M7);
+	T7 = _mm_unpackhi_epi32(M5, M7);
+	/* TL3..TL0 are columns -4..-1, TR0..TR3 are columns 0..3 relative to the edge */
+	TL3 = _mm_unpacklo_epi64(T0, T4);
+	TL2 = _mm_unpackhi_epi64(T0, T4);
+	TR0 = _mm_unpacklo_epi64(T2, T6);
+	TR1 = _mm_unpackhi_epi64(T2, T6);
+	TL1 = _mm_unpacklo_epi64(T1, T5);
+	TL0 = _mm_unpackhi_epi64(T1, T5);
+	TR2 = _mm_unpacklo_epi64(T3, T7);
+	TR3 = _mm_unpackhi_epi64(T3, T7);
+	/* per-lane |a - b|, built from a signed saturating subtract followed by abs */
+#define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b))
+
+	T0 = _mm_subabs_epu16(TL0, TR0); // |L0 - R0|
+	T1 = _mm_cmpgt_epi16(T0, c_1); // |L0 - R0| > 1
+	T2 = _mm_cmpgt_epi16(ALPHA, T0); // |L0 - R0| < Alpha
+	M0 = _mm_set_epi32(flag1, flag0, flag1, flag0); // rows 0-1 of U and V gated by flt_flag[0], rows 2-3 by flt_flag[1]
+	M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1: lanes that are allowed to be filtered
+	/* FLT_L / FLT_R: +2 when |x1 - x0| < Beta, +1 when |x2 - x0| < Beta */
+	T0 = _mm_subabs_epu16(TL1, TL0);
+	T1 = _mm_subabs_epu16(TR1, TR0);
+	FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2);
+	FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2);
+
+	T0 = _mm_subabs_epu16(TL2, TL0);
+	T1 = _mm_subabs_epu16(TR2, TR0);
+	M1 = _mm_cmpgt_epi16(BETA, T0);
+	M2 = _mm_cmpgt_epi16(BETA, T1);
+	FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L);
+	FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R);
+	FLT = _mm_add_epi16(FLT_L, FLT_R);
+	/* M1: lanes where R0 == R1 and L0 == L1 */
+	M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1));
+	T0 = _mm_sub_epi16(FLT, c_3);
+	T1 = _mm_sub_epi16(FLT, c_4);
+	/* candidates: FS56 = M1 ? FLT - 3 : FLT - 4; FS4 = (FLT_L == 2) ? 1 : 0 */
+
+	FS56 = _mm_blendv_epi8(T1, T0, M1);
+	FS4 = _mm_blendv_epi8(c_0, c_1, _mm_cmpeq_epi16(FLT_L, c_2));
+	/* select by FLT: > 4 takes FS56, == 4 takes FS4, anything else gives 0 */
+	FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4));
+	FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4));
+	/* force the strength to 0 in lanes rejected by mask1 */
+	FS = _mm_and_si128(FS, M0);
+
+#undef _mm_subabs_epu16
+	/* UV* keep the unfiltered inputs; TL* / TR* receive the blended outputs */
+	UVL0 = TL0;
+	UVL1 = TL1;
+	UVL2 = TL2;
+	UVR0 = TR0;
+	UVR1 = TR1;
+	UVR2 = TR2;
+
+	/* fs == 1: L0' = (3*L0 + R0 + 2) >> 2, R0' = (3*R0 + L0 + 2) >> 2 */
+	T2 = _mm_add_epi16(_mm_add_epi16(UVL0, UVR0), c_2); // L0 + R0 + 2
+
+	V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(UVL0, 1), T2), 2);
+
+	V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(UVR0, 1), T2), 2);
+
+	TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1));
+	TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1));
+
+	/* fs == 2: L0' = (3*L1 + 10*L0 + 3*R0 + 8) >> 4, R0' mirrored */
+	T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4
+	/* T2 is the term shared by the L0 and R0 sums */
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(UVL1, 1), _mm_add_epi16(UVL1, UVR0));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(UVL0, 3), _mm_add_epi16(T0, T2));
+
+	V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4);
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(UVR1, 1), _mm_add_epi16(UVR1, UVL0));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(UVR0, 3), _mm_add_epi16(T0, T2));
+
+	V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4);
+
+	TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_2));
+	TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_2));
+
+	/* fs == 3: L0' = (L2 + 4*L1 + 6*L0 + 4*R0 + R1 + 8) >> 4, R0' mirrored */
+	T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8
+	/* the rounding constant is already folded into T2, so no extra add before the shift */
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(UVL1, 2), _mm_add_epi16(UVL2, UVR1));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(UVL0, 1), _mm_add_epi16(T0, T2));
+
+	V0 = _mm_srli_epi16(T0, 4);
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(UVR1, 2), _mm_add_epi16(UVR2, UVL1));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(UVR0, 1), _mm_add_epi16(T0, T2));
+
+	V1 = _mm_srli_epi16(T0, 4);
+
+	TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_3));
+	TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_3));
+	/* fs == 3 also rewrites L1/R1: L1' = (3*L2 + 8*L1 + 4*L0 + R0 + 8) >> 4, R1' mirrored */
+	T0 = _mm_add_epi16(_mm_add_epi16(UVL2, UVR0), _mm_slli_epi16(UVL2, 1));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVL1, 3));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVL0, 2));
+	V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
+
+	T0 = _mm_add_epi16(_mm_add_epi16(UVR2, UVL0), _mm_slli_epi16(UVR2, 1));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVR1, 3));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVR0, 2));
+	V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
+
+	TL1 = _mm_blendv_epi8(TL1, V2, _mm_cmpeq_epi16(FS, c_3));
+	TR1 = _mm_blendv_epi8(TR1, V3, _mm_cmpeq_epi16(FS, c_3));
+
+	/* store result: transpose the columns back into rows (TL3, TL2, TR2, TR3 go back unchanged) */
+	M0 = _mm_unpacklo_epi16(TL3, TL2);
+	M1 = _mm_unpackhi_epi16(TL3, TL2);
+	M2 = _mm_unpacklo_epi16(TL1, TL0);
+	M3 = _mm_unpackhi_epi16(TL1, TL0);
+	M4 = _mm_unpacklo_epi16(TR0, TR1);
+	M5 = _mm_unpackhi_epi16(TR0, TR1);
+	M6 = _mm_unpacklo_epi16(TR2, TR3);
+	M7 = _mm_unpackhi_epi16(TR2, TR3);
+
+	T0 = _mm_unpacklo_epi32(M0, M2);
+	T1 = _mm_unpackhi_epi32(M0, M2);
+	T2 = _mm_unpacklo_epi32(M1, M3);
+	T3 = _mm_unpackhi_epi32(M1, M3);
+	T4 = _mm_unpacklo_epi32(M4, M6);
+	T5 = _mm_unpackhi_epi32(M4, M6);
+	T6 = _mm_unpacklo_epi32(M5, M7);
+	T7 = _mm_unpackhi_epi32(M5, M7);
+	/* M0..M3 become U rows 0..3, M4..M7 become V rows 0..3 */
+	M0 = _mm_unpacklo_epi64(T0, T4);
+	M1 = _mm_unpackhi_epi64(T0, T4);
+	M4 = _mm_unpacklo_epi64(T2, T6);
+	M5 = _mm_unpackhi_epi64(T2, T6);
+	M2 = _mm_unpacklo_epi64(T1, T5);
+	M3 = _mm_unpackhi_epi64(T1, T5);
+	M6 = _mm_unpacklo_epi64(T3, T7);
+	M7 = _mm_unpackhi_epi64(T3, T7);
+
+	pTmp = SrcPtrU - 4;
+	_mm_storeu_si128((__m128i*)(pTmp), M0);
+	_mm_storeu_si128((__m128i*)(pTmp + stride), M1);
+	_mm_storeu_si128((__m128i*)(pTmp + (stride << 1)), M2);
+	_mm_storeu_si128((__m128i*)(pTmp + stride * 3), M3);
+
+	pTmp = SrcPtrV - 4;
+	_mm_storeu_si128((__m128i*)(pTmp), M4);
+	_mm_storeu_si128((__m128i*)(pTmp + stride), M5);
+	_mm_storeu_si128((__m128i*)(pTmp + (stride << 1)), M6);
+	_mm_storeu_si128((__m128i*)(pTmp + stride * 3), M7);
+}
+
+void deblock_edge_hor_c_sse128(pel_t* SrcPtrU, pel_t* SrcPtrV, int stride, int Alpha, int Beta, uint8_t* flt_flag)
+{ /* Deblocks 4 U lanes and 4 V lanes across one horizontal edge: L0..L2 are rows ptr - 1..3 * stride, R0..R2 are rows ptr + 0..2 * stride. */
+	int inc = stride;
+	int inc2 = inc << 1;
+	int inc3 = inc + inc2;
+	int flag0 = flt_flag[0] ? -1 : 0; // all-ones mask when flt_flag[0] is non-zero
+	int flag1 = flt_flag[1] ? -1 : 0; // all-ones mask when flt_flag[1] is non-zero
+	/* NOTE(review): every operation below works on 16-bit lanes, so pel_t is presumably 16 bits wide -- confirm. */
+	__m128i UL0, UL1, UR0, UR1;
+	__m128i TL0, TL1, TL2;
+	__m128i TR0, TR1, TR2;
+	__m128i T0, T1, T2;
+	__m128i V0, V1, V2, V3;
+	__m128i M0, M1, M2;
+	__m128i FLT_L, FLT_R, FLT, FS;
+	__m128i FS4, FS56;
+	/* thresholds and small constants broadcast to all eight 16-bit lanes */
+	__m128i ALPHA = _mm_set1_epi16((short)Alpha);
+	__m128i BETA = _mm_set1_epi16((short)Beta);
+	__m128i c_0 = _mm_set1_epi16(0);
+	__m128i c_1 = _mm_set1_epi16(1);
+	__m128i c_2 = _mm_set1_epi16(2);
+	__m128i c_3 = _mm_set1_epi16(3);
+	__m128i c_4 = _mm_set1_epi16(4);
+	__m128i c_8 = _mm_set1_epi16(8);
+	/* each register packs 64 bits of a U row (low half) with 64 bits of the matching V row (high half) */
+	TL0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU - inc)), _mm_loadl_epi64((__m128i*)(SrcPtrV - inc)));
+	TL1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU - inc2)), _mm_loadl_epi64((__m128i*)(SrcPtrV - inc2)));
+	TL2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU - inc3)), _mm_loadl_epi64((__m128i*)(SrcPtrV - inc3)));
+	TR0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU)), _mm_loadl_epi64((__m128i*)(SrcPtrV)));
+	TR1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU + inc)), _mm_loadl_epi64((__m128i*)(SrcPtrV + inc)));
+	TR2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU + inc2)), _mm_loadl_epi64((__m128i*)(SrcPtrV + inc2)));
+	/* per-lane |a - b|, built from a signed saturating subtract followed by abs */
+#define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b))
+
+	T0 = _mm_subabs_epu16(TL0, TR0); // |L0 - R0|
+	T1 = _mm_cmpgt_epi16(T0, c_1); // |L0 - R0| > 1
+	T2 = _mm_cmpgt_epi16(ALPHA, T0); // |L0 - R0| < Alpha
+	/* lanes 0-1 of U and of V are gated by flt_flag[0], lanes 2-3 by flt_flag[1] */
+	M0 = _mm_set_epi32(flag1, flag0, flag1, flag0);
+	M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1: lanes that are allowed to be filtered
+	/* FLT_L / FLT_R: +2 when |x1 - x0| < Beta, +1 when |x2 - x0| < Beta */
+	T0 = _mm_subabs_epu16(TL1, TL0);
+	T1 = _mm_subabs_epu16(TR1, TR0);
+	FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2);
+	FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2);
+
+	T0 = _mm_subabs_epu16(TL2, TL0);
+	T1 = _mm_subabs_epu16(TR2, TR0);
+	M1 = _mm_cmpgt_epi16(BETA, T0);
+	M2 = _mm_cmpgt_epi16(BETA, T1);
+	FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L);
+	FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R);
+	FLT = _mm_add_epi16(FLT_L, FLT_R);
+	/* M1: lanes where R0 == R1 and L0 == L1 */
+	M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1));
+	T0 = _mm_subs_epi16(FLT, c_3);
+	T1 = _mm_subs_epi16(FLT, c_4);
+	/* candidates: FS56 = M1 ? FLT - 3 : FLT - 4; FS4 = (FLT_L == 2) ? 1 : 0 */
+	FS56 = _mm_blendv_epi8(T1, T0, M1);
+	FS4 = _mm_blendv_epi8(c_0, c_1, _mm_cmpeq_epi16(FLT_L, c_2));
+	/* select by FLT: > 4 takes FS56, == 4 takes FS4, anything else gives 0 */
+	FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4));
+	FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4));
+	/* force the strength to 0 in lanes rejected by mask1 */
+	FS = _mm_and_si128(FS, M0);
+
+#undef _mm_subabs_epu16
+	/* TL* / TR* stay unmodified as filter inputs; UL* / UR* receive the blended results */
+	UR0 = TR0;  // UL* / UR* hold the values that get stored
+	UR1 = TR1;
+	UL0 = TL0;
+	UL1 = TL1;
+
+	/* fs == 1: L0' = (3*L0 + R0 + 2) >> 2, R0' = (3*R0 + L0 + 2) >> 2 */
+	T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), c_2); // L0 + R0 + 2
+
+	V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0, 1), T2), 2);
+
+	V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0, 1), T2), 2);
+
+	UL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1));
+	UR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1));
+
+	/* fs == 2: L0' = (3*L1 + 10*L0 + 3*R0 + 8) >> 4, R0' mirrored */
+	T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 1), _mm_add_epi16(TL1, TR0));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 3), _mm_add_epi16(T0, T2));
+
+	V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4);
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 1), _mm_add_epi16(TR1, TL0));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 3), _mm_add_epi16(T0, T2));
+
+	V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4);
+
+	UL0 = _mm_blendv_epi8(UL0, V0, _mm_cmpeq_epi16(FS, c_2));
+	UR0 = _mm_blendv_epi8(UR0, V1, _mm_cmpeq_epi16(FS, c_2));
+
+	/* fs == 3: L0' = (L2 + 4*L1 + 6*L0 + 4*R0 + R1 + 8) >> 4, R0' mirrored */
+	T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 2), _mm_add_epi16(TL2, TR1));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 1), _mm_add_epi16(T0, T2));
+
+	V0 = _mm_srli_epi16(T0, 4);
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 2), _mm_add_epi16(TR2, TL1));
+
+	T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 1), _mm_add_epi16(T0, T2));
+
+	V1 = _mm_srli_epi16(T0, 4);
+
+	UL0 = _mm_blendv_epi8(UL0, V0, _mm_cmpeq_epi16(FS, c_3));
+	UR0 = _mm_blendv_epi8(UR0, V1, _mm_cmpeq_epi16(FS, c_3));
+	/* fs == 3 also rewrites L1/R1: L1' = (3*L2 + 8*L1 + 4*L0 + R0 + 8) >> 4, R1' mirrored */
+	T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0), _mm_slli_epi16(TL2, 1));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1, 3));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0, 2));
+	V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
+
+	T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0), _mm_slli_epi16(TR2, 1));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1, 3));
+	T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0, 2));
+	V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
+
+	UL1 = _mm_blendv_epi8(UL1, V2, _mm_cmpeq_epi16(FS, c_3));
+	UR1 = _mm_blendv_epi8(UR1, V3, _mm_cmpeq_epi16(FS, c_3));
+
+	/* store result */
+	/* low 64 bits go back to U, high 64 bits to V; _mm_storel_epi64 needs no alignment and also builds for 32-bit targets */
+	_mm_storel_epi64((__m128i*)(SrcPtrU - inc), UL0);
+	_mm_storel_epi64((__m128i*)(SrcPtrU), UR0);
+	_mm_storel_epi64((__m128i*)(SrcPtrU - inc2), UL1);
+	_mm_storel_epi64((__m128i*)(SrcPtrU + inc), UR1);
+	_mm_storel_epi64((__m128i*)(SrcPtrV - inc), _mm_srli_si128(UL0, 8));
+	_mm_storel_epi64((__m128i*)(SrcPtrV), _mm_srli_si128(UR0, 8));
+	_mm_storel_epi64((__m128i*)(SrcPtrV - inc2), _mm_srli_si128(UL1, 8));
+	_mm_storel_epi64((__m128i*)(SrcPtrV + inc), _mm_srli_si128(UR1, 8));
+}
+#endif // #if !HIGH_BIT_DEPTH
diff --git a/source/common/vec/intrinsic_idct.c b/source/common/vec/intrinsic_idct.c
index 31f4c97..8c24e51 100644
--- a/source/common/vec/intrinsic_idct.c
+++ b/source/common/vec/intrinsic_idct.c
@@ -36,6 +36,7 @@
 
 #include "../basic_types.h"
 #include "../avs2_defs.h"
+#include "../common.h"
 #include "intrinsic.h"
 
 #include <mmintrin.h>
@@ -65,12 +66,13 @@ extern ALIGN16(const int16_t g_2T_C[SEC_TR_SIZE * SEC_TR_SIZE]);
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_4x4_sse128(xavs2_t *h,
+                       const coeff_t *src, coeff_t *dst, int i_dst)
 {
     const int shift1 = 5;
-    const int shift2 = 20 - g_bit_depth;
+    const int shift2 = 20 - h->param->input_sample_bit_depth;
     // const int clip_depth1 = LIMIT_BIT;
-    const int clip_depth2 = g_bit_depth + 1;
+    const int clip_depth2 = h->param->input_sample_bit_depth + 1;
 
     const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A);
     const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011);
@@ -145,12 +147,13 @@ void idct_c_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_4x16_sse128(xavs2_t *h,
+                        const coeff_t *src, coeff_t *dst, int i_dst)
 {
     const int shift1 = 5;
-    const int shift2 = 20 - g_bit_depth;
+    const int shift2 = 20 - h->param->input_sample_bit_depth;
     // const int clip_depth1 = LIMIT_BIT;
-    const int clip_depth2 = g_bit_depth + 1;
+    const int clip_depth2 = h->param->input_sample_bit_depth + 1;
 
     const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D);   //row0 87high - 90low address
     const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028);
@@ -449,12 +452,13 @@ void idct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_16x4_sse128(xavs2_t *h,
+                        const coeff_t *src, coeff_t *dst, int i_dst)
 {
     const int shift1 = 5;
-    const int shift2 = 20 - g_bit_depth;
+    const int shift2 = 20 - h->param->input_sample_bit_depth;
     // const int clip_depth1 = LIMIT_BIT;
-    const int clip_depth2 = g_bit_depth + 1;
+    const int clip_depth2 = h->param->input_sample_bit_depth + 1;
 
     const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D);   //row0 87high - 90low address
     const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028);
@@ -717,12 +721,13 @@ void idct_c_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_8x8_sse128(xavs2_t *h,
+                       const coeff_t *src, coeff_t *dst, int i_dst)
 {
     // const int shift1 = 5;
-    const int shift2 = 20 - g_bit_depth;
+    const int shift2 = 20 - h->param->input_sample_bit_depth;
     // const int clip_depth1 = LIMIT_BIT;
-    const int clip_depth2 = g_bit_depth + 1;
+    const int clip_depth2 = h->param->input_sample_bit_depth + 1;
 
     __m128i S0, S1, S2, S3, S4, S5, S6, S7;
     __m128i mAdd, T0, T1, T2, T3;
@@ -815,7 +820,7 @@ void idct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
     E2l = _mm_add_epi32(E2l, mAdd);
     E2h = _mm_sub_epi32(EE1h, E01h);
     E2h = _mm_add_epi32(E2h, mAdd);
-    S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5));  // Ê×´Î·´±ä»»ÒÆÎ»Êý
+    S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5));  // shift amount of the first inverse-transform pass
     S7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 5));
     S1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 5));
     S6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 5));
@@ -987,12 +992,13 @@ void idct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_16x16_sse128(xavs2_t *h,
+                         const coeff_t *src, coeff_t *dst, int i_dst)
 {
     const int shift1 = 5;
-    const int shift2 = 20 - g_bit_depth;
+    const int shift2 = 20 - h->param->input_sample_bit_depth;
     //const int clip_depth1 = LIMIT_BIT;
-    const int clip_depth2 = g_bit_depth + 1;
+    const int clip_depth2 = h->param->input_sample_bit_depth + 1;
 
     const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D);   //row0 87high - 90low address
     const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028);
@@ -1311,7 +1317,7 @@ void idct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
     O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
     O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
     O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- 
+
             TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
             TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
             TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
@@ -1415,13 +1421,14 @@ void idct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_32x32_sse128(xavs2_t *h,
+                         const coeff_t *src, coeff_t *dst, int i_dst)
 {
     int a_flag = i_dst & 0x01;
     //int shift1 = 5;
-    int shift2 = 20 - g_bit_depth - a_flag;
+    int shift2 = 20 - h->param->input_sample_bit_depth - a_flag;
     //int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1 + a_flag;
+    int clip_depth2 = h->param->input_sample_bit_depth + 1 + a_flag;
 
     const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D);
     const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C);
@@ -2206,7 +2213,8 @@ void idct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_32x8_sse128(xavs2_t *h,
+                        const coeff_t *src, coeff_t *dst, int i_dst)
 {
     __m128i m128iS0[4], m128iS1[4], m128iS2[4], m128iS3[4], m128iS4[4], m128iS5[4], m128iS6[4], m128iS7[4];
     __m128i m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3;
@@ -2214,9 +2222,9 @@ void idct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
     __m128i O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l;
     __m128i EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
     //int shift1 = 5;
-    int shift2 = 20 - g_bit_depth - (i_dst & 0x01);
+    int shift2 = 20 - h->param->input_sample_bit_depth - (i_dst & 0x01);
     //int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01);
+    int clip_depth2 = h->param->input_sample_bit_depth + 1 + (i_dst & 0x01);
     int i, pass;
 
     i_dst &= 0xFE;    /* remember to remove the flag bit */
@@ -2305,7 +2313,7 @@ void idct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
         E2h = _mm_sub_epi32(EE1h, E01h);
         E2h = _mm_add_epi32(E2h,  m128iAdd);
 
-        m128iS0[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5));    // Ê×´Î·´±ä»»ÒÆÎ»Êý
+        m128iS0[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5));    // shift amount of the first inverse-transform pass
         m128iS7[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 5));
         m128iS1[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 5));
         m128iS6[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 5));
@@ -3044,7 +3052,8 @@ void idct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_8x32_sse128(xavs2_t *h,
+                        const coeff_t *src, coeff_t *dst, int i_dst)
 {
     const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D);
     const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C);
@@ -3228,9 +3237,9 @@ void idct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
 
     int nShift = 5, pass;
     //int shift1 = 5;
-    int shift2 = 20 - g_bit_depth - (i_dst & 0x01);
+    int shift2 = 20 - h->param->input_sample_bit_depth - (i_dst & 0x01);
     //int clip_depth1 = LIMIT_BIT;
-    int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01);
+    int clip_depth2 = h->param->input_sample_bit_depth + 1 + (i_dst & 0x01);
 
     // DCT1
     __m128i in00, in01, in02, in03, in04, in05, in06, in07, in08, in09, in10, in11, in12, in13, in14, in15;
@@ -3805,7 +3814,7 @@ void idct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
             E2l = _mm_add_epi32(E2l, c32_rnd);
             E2h = _mm_sub_epi32(EE1h, E01h);
             E2h = _mm_add_epi32(E2h, c32_rnd);
-            in00 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), nShift), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), nShift));     // Ê×´Î·´±ä»»ÒÆÎ»Êý
+            in00 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), nShift), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), nShift));     // shift amount of the first inverse-transform pass
             in07 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), nShift), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), nShift));
             in01 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), nShift), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), nShift));
             in06 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), nShift), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), nShift));
@@ -3966,11 +3975,12 @@ void inv_transform_2nd_sse128(coeff_t *coeff, int i_coeff, int i_mode, int b_top
 
 /* ---------------------------------------------------------------------------
  */
-void inv_transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff)
+void inv_transform_4x4_2nd_sse128(xavs2_t *h,
+                                  coeff_t *coeff, int i_coeff)
 {
     const int shift1 = 5;
-    const int shift2 = 20 - g_bit_depth + 2;
-    const int clip_depth2 = g_bit_depth + 1;
+    const int shift2 = 20 - h->param->input_sample_bit_depth + 2;
+    const int clip_depth2 = h->param->input_sample_bit_depth + 1;
 
     /*---vertical transform first---*/
     __m128i factor = _mm_set1_epi32(1 << (shift1 - 1));         // add1
@@ -4062,23 +4072,23 @@ void inv_transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff)
     O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
     O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
     O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- 
+
 #define TRANSPOSE_16x16_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1) \
     TRANSPOSE_8x8_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0); \
     TRANSPOSE_8x8_16BIT(A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1); \
     TRANSPOSE_8x8_16BIT(A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0); \
     TRANSPOSE_8x8_16BIT(A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1); \
- 
+
 
 /* ---------------------------------------------------------------------------
  */
 static void inv_wavelet_64x64_sse128(coeff_t *coeff)
 {
     int i;
-    //°´ÐÐ 64*64
+    // row-wise, 64*64
     __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8], T16[8], T17[8], T18[8], T19[8], T20[8], T21[8], T22[8], T23[8], T24[8], T25[8], T26[8], T27[8], T28[8], T29[8], T30[8], T31[8], T32[8], T33[8], T34[8], T35[8], T36[8], T37[8], T38[8], T39[8], T40[8], T41[8], T42[8], T43[8], T44[8], T45[8], T46[8], T47[8], T48[8], T49[8], T50[8], T51[8], T52[8], T53[8], T54[8], T55[8], T56[8], T57[8], T58[8], T59[8], T60[8], T61[8], T62[8], T63[8];
 
-    //°´ÁÐ 16*64
+    // column-wise, 16*64
     __m128i V00[8], V01[8], V02[8], V03[8], V04[8], V05[8], V06[8], V07[8], V08[8], V09[8], V10[8], V11[8], V12[8], V13[8], V14[8], V15[8], V16[8], V17[8], V18[8], V19[8], V20[8], V21[8], V22[8], V23[8], V24[8], V25[8], V26[8], V27[8], V28[8], V29[8], V30[8], V31[8], V32[8], V33[8], V34[8], V35[8], V36[8], V37[8], V38[8], V39[8], V40[8], V41[8], V42[8], V43[8], V44[8], V45[8], V46[8], V47[8], V48[8], V49[8], V50[8], V51[8], V52[8], V53[8], V54[8], V55[8], V56[8], V57[8], V58[8], V59[8], V60[8], V61[8], V62[8], V63[8];
 
     __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
@@ -4357,10 +4367,10 @@ static void inv_wavelet_64x64_sse128(coeff_t *coeff)
 static void inv_wavelet_64x16_sse128(coeff_t *coeff)
 {
     int i;
-    //°´ÐÐ 64*16
+    // row-wise, 64*16
     __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8];
 
-    //°´ÁÐ 16*64
+    // column-wise, 16*64
     __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2];
 
     __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
@@ -4573,10 +4583,10 @@ static void inv_wavelet_16x64_sse128(coeff_t *coeff)
     __m128i S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31;
     __m128i S32, S33, S34, S35, S36, S37, S38, S39, S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63;
 
-    //°´ÐÐ 64*16
+    // row-wise, 64*16
     __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8];
 
-    //°´ÁÐ 16*64
+    // column-wise, 16*64
     __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2];
 
     __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
@@ -4767,27 +4777,27 @@ static void inv_wavelet_16x64_sse128(coeff_t *coeff)
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_64x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
     UNUSED_PARAMETER(i_dst);
-    idct_c_32x32_sse128(src, dst, 32 | 0x01); /* 32x32 idct */
+    idct_c_32x32_sse128(h, src, dst, 32 | 0x01); /* 32x32 idct */
     inv_wavelet_64x64_sse128(dst);
 }
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_64x16_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
     UNUSED_PARAMETER(i_dst);
-    idct_c_32x8_sse128(src, dst, 32 | 0x01);
+    idct_c_32x8_sse128(h, src, dst, 32 | 0x01);
     inv_wavelet_64x16_sse128(dst);
 }
 
 /* ---------------------------------------------------------------------------
  */
-void idct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
+void idct_c_16x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst)
 {
     UNUSED_PARAMETER(i_dst);
-    idct_c_8x32_sse128(src, dst, 8 | 0x01);
+    idct_c_8x32_sse128(h, src, dst, 8 | 0x01);
     inv_wavelet_16x64_sse128(dst);
 }
diff --git a/source/common/vec/intrinsic_inter_pred.c b/source/common/vec/intrinsic_inter_pred.c
index 7dddf86..55957d8 100644
--- a/source/common/vec/intrinsic_inter_pred.c
+++ b/source/common/vec/intrinsic_inter_pred.c
@@ -44,9 +44,11 @@
 #include "intrinsic.h"
 #include "avs2_defs.h"
 
+#if !HIGH_BIT_DEPTH
 /* ---------------------------------------------------------------------------
  */
-void intpl_chroma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
+void intpl_chroma_block_hor_sse128(xavs2_t *h,
+                                   pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
 {
     const int16_t offset = 32;
     const int shift = 6;
@@ -95,7 +97,8 @@ void intpl_chroma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src,
 
 /* ---------------------------------------------------------------------------
  */
-void intpl_luma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
+void intpl_luma_block_hor_sse128(xavs2_t *h,
+                                 pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
 {
     int row, col = 0;
     const short offset = 32;
@@ -154,7 +157,8 @@ void intpl_luma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, i
 
 /* ---------------------------------------------------------------------------
  */
-void intpl_luma_hor_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
+void intpl_luma_hor_sse128(xavs2_t *h,
+                           pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
 {
     int row, col = 0;
     const short offset = 32;
@@ -270,9 +274,10 @@ void intpl_luma_hor_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *
 }
 
 /* ---------------------------------------------------------------------------
- * TODO: @luofl 20170827 °´ÕÕ intpl_luma_hor_sse128() ¸ÄÐ´£¬ÒÀ´Î²åÖµ16ÁÐ
+ * TODO: @luofl 20170827 rewrite following intpl_luma_hor_sse128(), interpolating 16 columns at a time
  */
-void intpl_luma_hor_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff)
+void intpl_luma_hor_x3_sse128(xavs2_t *h,
+                              pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff)
 {
     int row, col = 0;
     const short offset = 32;
@@ -557,7 +562,8 @@ void intpl_luma_hor_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *const tmp[3
     result = _mm_packus_epi16(mVal1, mVal1);
 
 
-void intpl_luma_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
+void intpl_luma_ver_sse128(xavs2_t *h,
+                           pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
 {
     int row, col;
     const short offset = 32;
@@ -691,7 +697,8 @@ void intpl_luma_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int wid
 /* ---------------------------------------------------------------------------
  *
  */
-void intpl_luma_ver_x3_sse128(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff)
+void intpl_luma_ver_x3_sse128(xavs2_t *h,
+                              pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff)
 {
     /*
     intpl_luma_ver_sse128(dst0, i_dst, src, i_src, width, height, coeff[0]);
@@ -956,7 +963,8 @@ void intpl_luma_ver_x3_sse128(pel_t *const dst[3], int i_dst, pel_t *src, int i_
 
 /* ---------------------------------------------------------------------------
  */
-void intpl_luma_ext_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff)
+void intpl_luma_ext_sse128(xavs2_t *h,
+                           pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff)
 {
     int row, col;
     int shift;
@@ -1188,7 +1196,8 @@ void intpl_luma_ext_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int wid
     }
 }
 
-void intpl_luma_ext_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff)
+void intpl_luma_ext_x3_sse128(xavs2_t *h,
+                              pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff)
 {
     /*
     intpl_luma_ext_sse128(dst0, i_dst, tmp, i_tmp, width, height, coeff[0]);
@@ -1581,7 +1590,8 @@ void intpl_luma_ext_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_
 
 /* ---------------------------------------------------------------------------
  */
-void intpl_chroma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
+void intpl_chroma_block_ver_sse128(xavs2_t *h,
+                                   pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
 {
     int row, col;
     const short offset = 32;
@@ -1704,7 +1714,8 @@ void intpl_chroma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src,
 
 /* ---------------------------------------------------------------------------
  */
-void intpl_luma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
+void intpl_luma_block_ver_sse128(xavs2_t *h,
+                                 pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
 {
     const short offset = 32;
     const int shift = 6;
@@ -1839,7 +1850,8 @@ void intpl_luma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, i
 
 /* ---------------------------------------------------------------------------
  */
-void intpl_chroma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y)
+void intpl_chroma_block_ext_sse128(xavs2_t *h,
+                                   pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y)
 {
     ALIGN16(int16_t tmp_res[(32 + 3) * 32]);
     int16_t *tmp = tmp_res;
@@ -2074,7 +2086,8 @@ void intpl_chroma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src,
 
 /* ---------------------------------------------------------------------------
  */
-void intpl_luma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y)
+void intpl_luma_block_ext_sse128(xavs2_t *h,
+                                 pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y)
 {
     ALIGN16(int16_t tmp_res[(64 + 7) * 64]);
     int16_t *tmp = tmp_res;
@@ -2312,4 +2325,4 @@ void intpl_luma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, i
         }
     }
 }
-
+#endif
diff --git a/source/common/vec/intrinsic_intra-filledge.c b/source/common/vec/intrinsic_intra-filledge.c
index 6776a5d..1bd8e3e 100644
--- a/source/common/vec/intrinsic_intra-filledge.c
+++ b/source/common/vec/intrinsic_intra-filledge.c
@@ -36,6 +36,7 @@
 
 #include "../avs2_defs.h"
 #include "../basic_types.h"
+#include "../common.h"
 #include "intrinsic.h"
 
 #include <string.h>
@@ -45,11 +46,13 @@
 #include <smmintrin.h>
 
 
+#if !HIGH_BIT_DEPTH
 /* ---------------------------------------------------------------------------
  * fill reference samples for intra prediction
- * LCUÄÚÔÚ×ó±ß½çÉÏµÄPU
+ * PU located on the left boundary inside the LCU
  */
-void fill_edge_samples_0_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
+void fill_edge_samples_0_sse128(xavs2_t *h,
+                                const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
 {
     __m128i T0, T1;
     int i, k, j;
@@ -60,12 +63,12 @@ void fill_edge_samples_0_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
     /* fill default value */
     k = ((bsy + bsx) << 1) + 1;
     j = (k >> 4) << 4;
-    T0 = _mm_set1_epi8((uint8_t)g_dc_value);
+    T0 = _mm_set1_epi8((uint8_t)((1 << h->param->input_sample_bit_depth) >> 1));
     for (i = 0; i < j; i += 16) {
         _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
     }
-    memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1);
-    EP[2 * bsx] = (pel_t)g_dc_value;
+    memset(&EP[-(bsy << 1)] + j, ((1 << h->param->input_sample_bit_depth) >> 1), k - j + 1);
+    EP[2 * bsx] = (pel_t)((1 << h->param->input_sample_bit_depth) >> 1);
 
     /* get prediction pixels ---------------------------------------
      * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
@@ -162,9 +165,10 @@ void fill_edge_samples_0_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
 
 /* ---------------------------------------------------------------------------
  * fill reference samples for intra prediction
- * LCUÄÚÔÚ×ó±ß½çÉÏµÄPU
+ * PU located on the left boundary inside the LCU
  */
-void fill_edge_samples_x_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
+void fill_edge_samples_x_sse128(xavs2_t *h,
+                                const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
 {
     __m128i T0, T1;
     int i, k, j;
@@ -175,12 +179,12 @@ void fill_edge_samples_x_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
     /* fill default value */
     k = ((bsy + bsx) << 1) + 1;
     j = (k >> 4) << 4;
-    T0 = _mm_set1_epi8((uint8_t)g_dc_value);
+    T0 = _mm_set1_epi8((uint8_t)((1 << h->param->input_sample_bit_depth) >> 1));
     for (i = 0; i < j; i += 16) {
         _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
     }
-    memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1);
-    EP[2 * bsx] = (pel_t)g_dc_value;
+    memset(&EP[-(bsy << 1)] + j, ((1 << h->param->input_sample_bit_depth) >> 1), k - j + 1);
+    EP[2 * bsx] = (pel_t)((1 << h->param->input_sample_bit_depth) >> 1);
 
     /* get prediction pixels ---------------------------------------
      * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
@@ -287,9 +291,10 @@ void fill_edge_samples_x_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
 
 /* ---------------------------------------------------------------------------
  * fill reference samples for intra prediction
- * LCUÄÚÔÚ×ó±ß½çÉÏµÄPU
+ * PU located on the left boundary inside the LCU
  */
-void fill_edge_samples_y_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
+void fill_edge_samples_y_sse128(xavs2_t *h,
+                                const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
 {
     __m128i T0, T1;
     int i, k, j;
@@ -301,12 +306,12 @@ void fill_edge_samples_y_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
     /* fill default value */
     k = ((bsy + bsx) << 1) + 1;
     j = (k >> 4) << 4;
-    T0 = _mm_set1_epi8((uint8_t)g_dc_value);
+    T0 = _mm_set1_epi8((uint8_t)((1 << h->param->input_sample_bit_depth) >> 1));
     for (i = 0; i < j; i += 16) {
         _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
     }
-    memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1);
-    EP[2 * bsx] = (pel_t)g_dc_value;
+    memset(&EP[-(bsy << 1)] + j, ((1 << h->param->input_sample_bit_depth) >> 1), k - j + 1);
+    EP[2 * bsx] = (pel_t)((1 << h->param->input_sample_bit_depth) >> 1);
 
     /* get prediction pixels ---------------------------------------
      * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
@@ -403,9 +408,10 @@ void fill_edge_samples_y_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP,
 
 /* ---------------------------------------------------------------------------
  * fill reference samples for intra prediction
- * LCUÄÚÔÚ×ó±ß½çÉÏµÄPU
+ * PU located on the left boundary inside the LCU
  */
-void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
+void fill_edge_samples_xy_sse128(xavs2_t *h,
+                                 const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
 {
     __m128i T0, T1;
     int i, k, j;
@@ -418,12 +424,12 @@ void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP
     /* fill default value */
     k = ((bsy + bsx) << 1) + 1;
     j = (k >> 4) << 4;
-    T0 = _mm_set1_epi8((uint8_t)g_dc_value);
+    T0 = _mm_set1_epi8((uint8_t)((1 << h->param->input_sample_bit_depth) >> 1));
     for (i = 0; i < j; i += 16) {
         _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
     }
-    memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1);
-    EP[2 * bsx] = (pel_t)g_dc_value;
+    memset(&EP[-(bsy << 1)] + j, ((1 << h->param->input_sample_bit_depth) >> 1), k - j + 1);
+    EP[2 * bsx] = (pel_t)((1 << h->param->input_sample_bit_depth) >> 1);
 
     /* get prediction pixels ---------------------------------------
      * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
@@ -527,5 +533,5 @@ void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP
         EP[0] = pL[0];
     }
 }
-
+#endif
 
diff --git a/source/common/vec/intrinsic_intra-pred.c b/source/common/vec/intrinsic_intra-pred.c
index d807f6d..ac13b20 100644
--- a/source/common/vec/intrinsic_intra-pred.c
+++ b/source/common/vec/intrinsic_intra-pred.c
@@ -36,6 +36,7 @@
 
 #include "../avs2_defs.h"
 #include "../basic_types.h"
+#include "../common.h"
 #include "intrinsic.h"
 #include <string.h>
 #include <mmintrin.h>
@@ -44,6 +45,7 @@
 #include <smmintrin.h>
 
 
+#if !HIGH_BIT_DEPTH
 static ALIGN16(int8_t tab_coeff_mode_5[8][16]) = {
     { 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12 },
     { 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24 },
@@ -63,7 +65,8 @@ static uint8_t tab_idx_mode_5[64] = {
 
 /* ---------------------------------------------------------------------------
  */
-void intra_pred_ver_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ver_sse128(xavs2_t *h,
+                           pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int y;
     pel_t *rpSrc = src + 1;
@@ -124,7 +127,8 @@ void intra_pred_ver_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int
 
 /* ---------------------------------------------------------------------------
  */
-void intra_pred_hor_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_hor_sse128(xavs2_t *h,
+                           pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int y;
     pel_t *rpSrc = src - 1;
@@ -179,7 +183,8 @@ void intra_pred_hor_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int
 
 /* ---------------------------------------------------------------------------
  */
-void intra_pred_dc_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_dc_sse128(xavs2_t *h,
+                          pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int avail_above = dir_mode >> 8;
     int avail_left = dir_mode & 0xFF;
@@ -251,7 +256,7 @@ void intra_pred_dc_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int b
     } else if (avail_above) {
         dc_value = (sum_above + (bsx >> 1)) >> xavs2_log2u(bsx);
     } else {
-        dc_value = g_dc_value;
+        dc_value = ((1 << h->param->input_sample_bit_depth) >> 1);
     }
 
     p00 = _mm_set1_epi8((pel_t)dc_value);
@@ -272,7 +277,8 @@ void intra_pred_dc_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int b
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_plane_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_plane_sse128(xavs2_t *h,
+                             pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     pel_t  *rpSrc;
     int iH = 0;
@@ -356,7 +362,8 @@ void intra_pred_plane_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, in
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_bilinear_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_bilinear_sse128(xavs2_t *h,
+                                pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int x, y;
     int ishift_x = tab_log2[bsx];
@@ -543,7 +550,8 @@ void intra_pred_bilinear_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
  */
-void intra_pred_ang_x_3_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_x_3_sse128(xavs2_t *h,
+                               pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     __m128i zero = _mm_setzero_si128();
     __m128i coeff2 = _mm_set1_epi16(2);
@@ -1111,7 +1119,8 @@ void intra_pred_ang_x_3_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
  */
-void intra_pred_ang_x_4_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_x_4_sse128(xavs2_t *h,
+                               pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
 
     ALIGN16(pel_t first_line[64 + 128]);
@@ -1244,7 +1253,8 @@ void intra_pred_ang_x_4_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
  */
-void intra_pred_ang_x_5_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_x_5_sse128(xavs2_t *h,
+                               pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     __m128i zero = _mm_setzero_si128();
     __m128i coeff2 = _mm_set1_epi16(2);
@@ -1977,7 +1987,8 @@ void intra_pred_ang_x_5_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_x_6_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_x_6_sse128(xavs2_t *h,
+                               pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     ALIGN16(pel_t first_line[64 + 64]);
     int line_size = bsx + bsy - 1;
@@ -2194,7 +2205,8 @@ void intra_pred_ang_x_6_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_x_7_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_x_7_sse128(xavs2_t *h,
+                               pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i, j;
     int iWidth2 = bsx << 1;
@@ -2521,7 +2533,8 @@ void intra_pred_ang_x_7_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_x_8_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_x_8_sse128(xavs2_t *h,
+                               pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     ALIGN16(pel_t first_line[2 * (64 + 48)]);
     int line_size = bsx + (bsy >> 1) - 1;
@@ -2702,7 +2715,8 @@ void intra_pred_ang_x_8_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_x_9_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_x_9_sse128(xavs2_t *h,
+                               pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i, j;
     int iWidth2 = bsx << 1;
@@ -3030,7 +3044,8 @@ void intra_pred_ang_x_9_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_x_10_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_x_10_sse128(xavs2_t *h,
+                                pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
     pel_t *dst1 = dst;
@@ -3526,7 +3541,8 @@ void intra_pred_ang_x_10_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_x_11_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_x_11_sse128(xavs2_t *h,
+                                pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i, j, idx;
     __m128i zero = _mm_setzero_si128();
@@ -3663,7 +3679,8 @@ void intra_pred_ang_x_11_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_y_25_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_y_25_sse128(xavs2_t *h,
+                                pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
 
@@ -4152,7 +4169,8 @@ void intra_pred_ang_y_25_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_y_26_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_y_26_sse128(xavs2_t *h,
+                                pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
 
@@ -4576,7 +4594,8 @@ void intra_pred_ang_y_26_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_y_28_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_y_28_sse128(xavs2_t *h,
+                                pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     ALIGN16(pel_t first_line[64 + 128]);
     int line_size = bsx + (bsy - 1) * 2;
@@ -4717,7 +4736,8 @@ void intra_pred_ang_y_28_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_y_30_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_y_30_sse128(xavs2_t *h,
+                                pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     ALIGN16(pel_t first_line[64 + 64]);
     int line_size = bsx + bsy - 1;
@@ -4956,7 +4976,8 @@ void intra_pred_ang_y_30_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_y_31_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_y_31_sse128(xavs2_t *h,
+                                pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     ALIGN16(pel_t dst_tran[64 * 80]);
     ALIGN16(pel_t src_tran[64 * 8]);
@@ -4970,7 +4991,7 @@ void intra_pred_ang_y_31_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
         src_tran[i] = src[-i];
     }
 
-    intra_pred_ang_x_5_sse128(src_tran, dst_tran, i_dst2, 5, bsy, bsx);
+    intra_pred_ang_x_5_sse128(h, src_tran, dst_tran, i_dst2, 5, bsy, bsx);
 
     if ((bsy > 4) && (bsx > 4)) {
         pel_t *pDst_128[64];
@@ -5212,7 +5233,8 @@ void intra_pred_ang_y_31_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_y_32_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_y_32_sse128(xavs2_t *h,
+                                pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     ALIGN16(pel_t first_line[2 * (64 + 64)]);
     int line_size = (bsy >> 1) + bsx - 1;
@@ -5367,7 +5389,8 @@ void intra_pred_ang_y_32_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode,
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_xy_13_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_xy_13_sse128(xavs2_t *h,
+                                 pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     __m128i zero = _mm_setzero_si128();
     __m128i coeff2 = _mm_set1_epi16(2);
@@ -6383,7 +6406,8 @@ void intra_pred_ang_xy_13_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_xy_14_sse128(xavs2_t *h,
+                                 pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
     __m128i coeff2 = _mm_set1_epi16(2);
@@ -6453,7 +6477,7 @@ void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode
             ((int*)&pfirst[3][i])[0] = _mm_cvtsi128_si32(p00);
         }
 
-        if (i < left_size) { //Ê¹ÓÃcÓïÑÔ¿ÉÄÜ»á¸üÓÅ
+        if (i < left_size) { // plain C might be faster here
             __m128i p00, p01, p10;
             __m128i p20, p30;
             __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1));
@@ -6755,7 +6779,8 @@ void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_xy_16_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_xy_16_sse128(xavs2_t *h,
+                                 pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     ALIGN16(pel_t first_line[2 * (64 + 48)]);
     int line_size = bsx + bsy / 2 - 1;
@@ -6925,7 +6950,8 @@ void intra_pred_ang_xy_16_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_xy_18_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_xy_18_sse128(xavs2_t *h,
+                                 pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     ALIGN16(pel_t first_line[64 + 64]);
     int line_size = bsx + bsy - 1;
@@ -7016,7 +7042,8 @@ void intra_pred_ang_xy_18_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_xy_20_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_xy_20_sse128(xavs2_t *h,
+                                 pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     ALIGN16(pel_t first_line[64 + 128]);
     int left_size = (bsy - 1) * 2 + 1;
@@ -7188,7 +7215,8 @@ void intra_pred_ang_xy_20_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_xy_22_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_xy_22_sse128(xavs2_t *h,
+                                 pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
     int i;
 
@@ -7464,7 +7492,8 @@ void intra_pred_ang_xy_22_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode
 
 /* ---------------------------------------------------------------------------
 */
-void intra_pred_ang_xy_23_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
+void intra_pred_ang_xy_23_sse128(xavs2_t *h,
+                                 pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
 {
 
     int i;
@@ -7924,7 +7953,527 @@ void intra_pred_ang_xy_23_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode
             *((int*)dst) = _mm_cvtsi128_si32(M7);
         }
     }
+}
+#else
+ /*****************************************************************************
+ *  Copyright (C) 2016 uavs2dec project,
+ *  National Engineering Laboratory for Video Technology(Shenzhen),
+ *  Digital Media R&D Center at Peking University Shenzhen Graduate School, China
+ *  Project Leader: Ronggang Wang <rgwang@pkusz.edu.cn>
+ *
+ *  Main Authors: Zhenyu Wang <wangzhenyu@pkusz.edu.cn>, Kui Fan <kuifan@pku.edu.cn>
+ *               Shenghao Zhang <1219759986@qq.com>, Bingjie Han, Kaili Yao, Hongbin Cao, Yueming Wang,
+ *               Jing Su, Jiaying Yan, Junru Li
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at rgwang@pkusz.edu.cn.
+ *****************************************************************************/
+
+void intra_pred_ver_sse128(xavs2_t *h,   /* h is not referenced by this function */
+                           pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight)
+{   /* Vertical intra prediction: each of the iHeight rows of dst receives the pels pSrc[1 .. iWidth]. */
+    int y;
+    pel_t* rpSrc = pSrc + 1;    /* first pel of the run that is replicated into every row */
+    __m128i T1, T2, T3, T4;
+    __m128i M1, M2, M3, M4;
 
+    UNUSED_PARAMETER(dir_mode);
+    /* Offsets below advance by 8 pels per 128-bit register, i.e. pel_t is treated as 16 bits wide. */
+    switch (iWidth) {
+    case 4:
+        for (y = 0; y < iHeight; y += 2) {  /* two rows per iteration; NOTE(review): assumes iHeight is even */
+            CP64(dst, rpSrc);               /* presumably a 64-bit copy (4 pels) - confirm against the macro */
+            CP64(dst + i_dst, rpSrc);
+            dst += i_dst << 1;              /* advance by two rows (i_dst is the row stride in pels) */
+        }
+        break;
+    case 8:
+        T1 = _mm_loadu_si128((__m128i*)rpSrc);  /* the source row is loaded once, before the row loop */
+        for (y = 0; y < iHeight; y++) {
+            _mm_storeu_si128((__m128i*)(dst), T1);
+            dst += i_dst;
+        }
+        break;
+    case 16:
+        T1 = _mm_loadu_si128((__m128i*)(rpSrc + 0));
+        T2 = _mm_loadu_si128((__m128i*)(rpSrc + 8));
+        for (y = 0; y < iHeight; y++) {
+            _mm_storeu_si128((__m128i*)(dst + 0), T1);
+            _mm_storeu_si128((__m128i*)(dst + 8), T2);
+            dst += i_dst;
+        }
+        break;
+    case 32:
+        T1 = _mm_loadu_si128((__m128i*)(rpSrc + 0));
+        T2 = _mm_loadu_si128((__m128i*)(rpSrc + 8));
+        T3 = _mm_loadu_si128((__m128i*)(rpSrc + 16));
+        T4 = _mm_loadu_si128((__m128i*)(rpSrc + 24));
+        for (y = 0; y < iHeight; y++) {
+            _mm_storeu_si128((__m128i*)(dst + 0), T1);
+            _mm_storeu_si128((__m128i*)(dst + 8), T2);
+            _mm_storeu_si128((__m128i*)(dst + 16), T3);
+            _mm_storeu_si128((__m128i*)(dst + 24), T4);
+            dst += i_dst;
+        }
+        break;
+    case 64:
+        T1 = _mm_loadu_si128((__m128i*)(rpSrc + 0));
+        T2 = _mm_loadu_si128((__m128i*)(rpSrc + 8));
+        T3 = _mm_loadu_si128((__m128i*)(rpSrc + 16));
+        T4 = _mm_loadu_si128((__m128i*)(rpSrc + 24));
+        M1 = _mm_loadu_si128((__m128i*)(rpSrc + 32));
+        M2 = _mm_loadu_si128((__m128i*)(rpSrc + 40));
+        M3 = _mm_loadu_si128((__m128i*)(rpSrc + 48));
+        M4 = _mm_loadu_si128((__m128i*)(rpSrc + 56));
+        for (y = 0; y < iHeight; y++) {
+            _mm_storeu_si128((__m128i*)(dst + 0), T1);
+            _mm_storeu_si128((__m128i*)(dst + 8), T2);
+            _mm_storeu_si128((__m128i*)(dst + 16), T3);
+            _mm_storeu_si128((__m128i*)(dst + 24), T4);
+            _mm_storeu_si128((__m128i*)(dst + 32), M1);
+            _mm_storeu_si128((__m128i*)(dst + 40), M2);
+            _mm_storeu_si128((__m128i*)(dst + 48), M3);
+            _mm_storeu_si128((__m128i*)(dst + 56), M4);
+            dst += i_dst;
+        }
+        break;
+    default:
+        assert(0);  /* only widths 4, 8, 16, 32 and 64 are handled */
+        break;
+    }
+}
+
+void intra_pred_hor_sse128(xavs2_t *h,   /* h is not referenced by this function */
+                           pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight)
+{   /* Horizontal intra prediction: row y of dst is filled with the single pel pSrc[-1 - y]. */
+    int y;
+    pel_t* rpSrc = pSrc - 1;    /* rpSrc[-y] is the reference pel used for row y */
+    __m128i T;
+
+    UNUSED_PARAMETER(dir_mode);
+    /* Each 128-bit store below covers 8 pels, i.e. pel_t is treated as 16 bits wide. */
+    switch (iWidth) {
+    case 4:
+        for (y = 0; y < iHeight; y++) {
+            M64(dst) = 0x0001000100010001 * rpSrc[-y];  /* the multiply copies the 16-bit pel into all four 16-bit fields; M64 is presumably a 64-bit store */
+            dst += i_dst;
+        }
+        break;
+    case 8:
+        for (y = 0; y < iHeight; y++) {
+            T = _mm_set1_epi16((pel_t)rpSrc[-y]);   /* broadcast the row's reference pel to 8 lanes */
+            _mm_storeu_si128((__m128i*)(dst), T);
+            dst += i_dst;
+        }
+        break;
+    case 16:
+        for (y = 0; y < iHeight; y++) {
+            T = _mm_set1_epi16((pel_t)rpSrc[-y]);
+            _mm_storeu_si128((__m128i*)(dst + 0), T);
+            _mm_storeu_si128((__m128i*)(dst + 8), T);
+            dst += i_dst;
+        }
+        break;
+    case 32:
+        for (y = 0; y < iHeight; y++) {
+            T = _mm_set1_epi16((pel_t)rpSrc[-y]);
+            _mm_storeu_si128((__m128i*)(dst + 0), T);
+            _mm_storeu_si128((__m128i*)(dst + 8), T);
+            _mm_storeu_si128((__m128i*)(dst + 16), T);
+            _mm_storeu_si128((__m128i*)(dst + 24), T);
+            dst += i_dst;
+        }
+        break;
+    case 64:
+        for (y = 0; y < iHeight; y++) {
+            T = _mm_set1_epi16((pel_t)rpSrc[-y]);
+            _mm_storeu_si128((__m128i*)(dst + 0), T);
+            _mm_storeu_si128((__m128i*)(dst + 8), T);
+            _mm_storeu_si128((__m128i*)(dst + 16), T);
+            _mm_storeu_si128((__m128i*)(dst + 24), T);
+            _mm_storeu_si128((__m128i*)(dst + 32), T);
+            _mm_storeu_si128((__m128i*)(dst + 40), T);
+            _mm_storeu_si128((__m128i*)(dst + 48), T);
+            _mm_storeu_si128((__m128i*)(dst + 56), T);
+            dst += i_dst;
+        }
+        break;
+    default:
+        assert(0);  /* only widths 4, 8, 16, 32 and 64 are handled */
+        break;
+    }
+}
+
+void intra_pred_plane_sse128(xavs2_t *h,   /* h supplies param->input_sample_bit_depth for the upper clip bound */
+                             pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight)
+{   /* Plane intra prediction: dst[y][x] = clip((iTmp + x * iB + y * iC) >> 5, 0, max_pixel); i_dst is the row stride in pels. */
+    pel_t* rpSrc;
+    int iH = 0;                 /* weighted horizontal gradient of the top reference row */
+    int iV = 0;                 /* weighted vertical gradient of the left reference column */
+    int iA, iB, iC;
+    int x, y;
+    int iW2 = iWidth >> 1;
+    int iH2 = iHeight >> 1;
+    int ib_mult[5] = { 13, 17, 5, 11, 23 };     /* indexed by log2(size) - 2 */
+    int ib_shift[5] = { 7, 10, 11, 15, 19 };    /* indexed by log2(size) - 2 */
+    int max_pixel = (1 << h->param->input_sample_bit_depth) - 1;
+    __m128i max_val = _mm_set1_epi16((pel_t)max_pixel);
+    /* multiplier/shift pairs that scale the gradients into per-pel slopes */
+    int im_h = ib_mult[tab_log2[iWidth] - 2];
+    int is_h = ib_shift[tab_log2[iWidth] - 2];
+    int im_v = ib_mult[tab_log2[iHeight] - 2];
+    int is_v = ib_shift[tab_log2[iHeight] - 2];
+
+    int iTmp;
+    __m128i TC, TB, TA, T_Start, T, D, D1;
+
+    UNUSED_PARAMETER(dir_mode);
+    /* top row: differences of pels mirrored around pSrc[iW2], weighted by their distance */
+    rpSrc = pSrc + iW2;
+    for (x = 1; x < iW2 + 1; x++) {
+        iH += x * (rpSrc[x] - rpSrc[-x]);
+    }
+    /* left column: same, mirrored around pSrc[-iH2] (left pels lie at decreasing addresses) */
+    rpSrc = pSrc - iH2;
+    for (y = 1; y < iH2 + 1; y++) {
+        iV += y * (rpSrc[-y] - rpSrc[y]);
+    }
+
+    iA = (pSrc[-1 - (iHeight - 1)] + pSrc[1 + iWidth - 1]) << 4;   /* (last left pel + last top pel) * 16 */
+    iB = ((iH << 5) * im_h + (1 << (is_h - 1))) >> is_h;           /* horizontal slope, rounded */
+    iC = ((iV << 5) * im_v + (1 << (is_v - 1))) >> is_v;           /* vertical slope, rounded */
+
+    iTmp = iA - (iH2 - 1) * iC - (iW2 - 1) * iB + 16;              /* value at (0,0) incl. rounding for >> 5 */
+    /* Broadcast full 32-bit values: with 10-bit samples iA alone reaches 32736, so narrowing to int16_t could wrap. */
+    TA = _mm_set1_epi32(iTmp);
+    TB = _mm_set1_epi32(iB);
+    TC = _mm_set1_epi32(iC);
+
+    T_Start = _mm_set_epi32(3, 2, 1, 0);        /* lane k holds column index k */
+    T_Start = _mm_mullo_epi32(TB, T_Start);
+    T_Start = _mm_add_epi32(T_Start, TA);       /* lane k = iTmp + k * iB: first four pels of row 0 */
+
+    TB = _mm_slli_epi32(TB, 2);                 /* step of four columns */
+
+    if (iWidth <= 4) {
+        for (y = 0; y < iHeight; y++) {
+            D = _mm_srai_epi32(T_Start, 5);
+            D = _mm_packus_epi32(D, D);         /* unsigned saturation clamps negative values to 0 */
+            D = _mm_min_epu16(D, max_val);      /* upper clamp to max_pixel */
+            _mm_storel_epi64((__m128i*)dst, D); /* low 64 bits = 4 pels */
+            T_Start = _mm_add_epi32(T_Start, TC);
+            dst += i_dst;
+        }
+    }
+    else
+    {
+        for (y = 0; y < iHeight; y++) {
+            T = T_Start;
+            for (x = 0; x < iWidth; x += 8) {   /* two 4-lane groups per iteration = 8 pels */
+                D = _mm_srai_epi32(T, 5);
+                T = _mm_add_epi32(T, TB);
+                D1 = _mm_srai_epi32(T, 5);
+                T = _mm_add_epi32(T, TB);
+                D = _mm_packus_epi32(D, D1);
+                D = _mm_min_epu16(D, max_val);
+                _mm_storeu_si128((__m128i*)(dst + x), D);
+            }
+            T_Start = _mm_add_epi32(T_Start, TC);   /* next row: add the vertical slope */
+            dst += i_dst;
+        }
+    }
+}
+
+void intra_pred_bilinear_sse128(xavs2_t *h,   /* h supplies param->input_sample_bit_depth for the upper clip bound */
+                                pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight)
+{   /* Bilinear intra prediction from the top row pSrc[1..iWidth] and the left column pSrc[-1..-iHeight]; i_dst is the row stride in pels. */
+    int x, y;
+    int ishift_x = tab_log2[iWidth];
+    int ishift_y = tab_log2[iHeight];
+    int ishift = min(ishift_x, ishift_y);
+    int ishift_xy = ishift_x + ishift_y + 1;    /* final normalisation shift */
+    int offset = 1 << (ishift_x + ishift_y);    /* rounding term for that shift */
+    int a, b, c, w, val;
+    pel_t* p;
+    __m128i T, T1, T2, T3, C1, C2, ADD;
+    __m128i ZERO = _mm_setzero_si128();
+    __m128i shuff = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);  /* reverses the order of the eight 16-bit lanes */
+    int max_pixel = (1 << h->param->input_sample_bit_depth) - 1;
+    __m128i max_val = _mm_set1_epi16((pel_t)max_pixel);
+    /* 16-bit scratch rows; the +16 slack lets the 8-lane loops below store past the last used entry */
+    ALIGN16(int16_t pTop[MAX_CU_SIZE + 16]);    /* top[x] << ishift_y */
+    ALIGN16(int16_t pLeft[MAX_CU_SIZE + 16]);   /* left[y] << ishift_x */
+    ALIGN16(int16_t pT[MAX_CU_SIZE + 16]);      /* b - top[x] */
+    ALIGN16(int16_t pL[MAX_CU_SIZE + 16]);      /* a - left[y] */
+    ALIGN16(int16_t wy[MAX_CU_SIZE + 16]);      /* w * y */
+
+    UNUSED_PARAMETER(dir_mode);
+
+    a = pSrc[iWidth];       /* last pel of the top reference row */
+    b = pSrc[-iHeight];     /* last pel of the left reference column */
+    /* c: plain rounded average for square blocks, otherwise a size-weighted blend of a and b */
+    c = (iWidth == iHeight) ? (a + b + 1) >> 1 : (((a << ishift_x) + (b << ishift_y)) * 13 + (1 << (ishift + 5))) >> (ishift + 6);
+    w = (c << 1) - a - b;
+
+    T = _mm_set1_epi16((int16_t)b);
+    p = pSrc + 1;
+
+    for (x = 0; x < iWidth; x += 8) {
+        T1 = _mm_loadu_si128((__m128i*)(p + x));
+        T2 = _mm_sub_epi16(T, T1);
+        T1 = _mm_slli_epi16(T1, ishift_y);
+        _mm_store_si128((__m128i*)(pT + x), T2);
+        _mm_store_si128((__m128i*)(pTop + x), T1);
+    }
+
+    T = _mm_set1_epi16((int16_t)a);
+    p = pSrc - 8;           /* left pels lie at decreasing addresses: load 8 ending at pSrc[-1 - y] */
+
+    for (y = 0; y < iHeight; y += 8) {
+        T1 = _mm_loadu_si128((__m128i*)(p - y));
+        T1 = _mm_shuffle_epi8(T1, shuff);   /* reverse so that lane k holds pSrc[-1 - (y + k)] */
+        T2 = _mm_sub_epi16(T, T1);
+        T1 = _mm_slli_epi16(T1, ishift_x);
+        _mm_store_si128((__m128i*)(pL + y), T2);
+        _mm_store_si128((__m128i*)(pLeft + y), T1);
+    }
+
+    T = _mm_set1_epi16((int16_t)w);
+    T = _mm_mullo_epi16(T, _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));  /* w * {0..7} */
+    T1 = _mm_set1_epi16((int16_t)(8 * w));
+
+    for (y = 0; y < iHeight; y += 8) {
+        _mm_store_si128((__m128i*)(wy + y), T);
+        T = _mm_add_epi16(T, T1);
+    }
+
+    C1 = _mm_set_epi32(3, 2, 1, 0);     /* column index within a 4-lane group */
+    C2 = _mm_set1_epi32(4);             /* step from one 4-lane group to the next */
+    /* All three paths compute ((T << ishift_x) + val + x * add) >> ishift_xy, clamped to [0, max_pixel]. */
+    if (iWidth == 4) {
+        __m128i pTT = _mm_loadl_epi64((__m128i*)pT);
+        T = _mm_loadl_epi64((__m128i*)pTop);
+        for (y = 0; y < iHeight; y++) {
+            int add = (pL[y] << ishift_y) + wy[y];      /* per-column increment for this row */
+            ADD = _mm_set1_epi32(add);
+            ADD = _mm_mullo_epi32(C1, ADD);
+
+            val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y);    /* row constant incl. rounding */
+
+            ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val));
+            T = _mm_add_epi16(T, pTT);                  /* running top term: pTop + (y + 1) * pT */
+
+            T1 = _mm_cvtepi16_epi32(T);
+            T1 = _mm_slli_epi32(T1, ishift_x);
+
+            T1 = _mm_add_epi32(T1, ADD);
+            T1 = _mm_srai_epi32(T1, ishift_xy);
+
+            T1 = _mm_packus_epi32(T1, T1);              /* unsigned saturation clamps negatives to 0 */
+            T1 = _mm_min_epu16(T1, max_val);
+            _mm_storel_epi64((__m128i*)dst, T1);
+
+            dst += i_dst;
+        }
+    }
+    else if (iWidth == 8) {
+        __m128i pTT = _mm_load_si128((__m128i*)pT);
+        T = _mm_load_si128((__m128i*)pTop);
+        for (y = 0; y < iHeight; y++) {
+            int add = (pL[y] << ishift_y) + wy[y];
+            ADD = _mm_set1_epi32(add);
+            T3 = _mm_mullo_epi32(C2, ADD);              /* 4 * add: advance to the next 4-lane group */
+            ADD = _mm_mullo_epi32(C1, ADD);
+
+            val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y);
+
+            ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val));
+
+            T = _mm_add_epi16(T, pTT);
+
+            T1 = _mm_cvtepi16_epi32(T);
+            T2 = _mm_cvtepi16_epi32(_mm_srli_si128(T, 8));
+            T1 = _mm_slli_epi32(T1, ishift_x);
+            T2 = _mm_slli_epi32(T2, ishift_x);
+
+            T1 = _mm_add_epi32(T1, ADD);
+            T1 = _mm_srai_epi32(T1, ishift_xy);
+            ADD = _mm_add_epi32(ADD, T3);
+
+            T2 = _mm_add_epi32(T2, ADD);
+            T2 = _mm_srai_epi32(T2, ishift_xy);
+            ADD = _mm_add_epi32(ADD, T3);
+
+            T1 = _mm_packus_epi32(T1, T2);
+            T1 = _mm_min_epu16(T1, max_val);
+            _mm_storeu_si128((__m128i*)dst, T1);
+
+            dst += i_dst;
+        }
+    }
+    else {
+        __m128i TT[16];     /* running top term, kept in 32-bit lanes (4 columns per entry) */
+        __m128i PTT[16];    /* per-row increment b - top[x], sign-extended to 32 bits */
+        for (x = 0; x < iWidth; x += 8) {
+            int idx = x >> 2;
+            __m128i M0 = _mm_load_si128((__m128i*)(pTop + x));
+            __m128i M1 = _mm_load_si128((__m128i*)(pT + x));
+            TT[idx] = _mm_unpacklo_epi16(M0, ZERO);     /* zero-extend: pTop entries are read as unsigned 16-bit */
+            TT[idx + 1] = _mm_unpackhi_epi16(M0, ZERO);
+            PTT[idx] = _mm_cvtepi16_epi32(M1);
+            PTT[idx + 1] = _mm_cvtepi16_epi32(_mm_srli_si128(M1, 8));
+        }
+        for (y = 0; y < iHeight; y++) {
+            int add = (pL[y] << ishift_y) + wy[y];
+            ADD = _mm_set1_epi32(add);
+            T3 = _mm_mullo_epi32(C2, ADD);
+            ADD = _mm_mullo_epi32(C1, ADD);
+
+            val = ((uint16_t)pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y);  /* pLeft is read as unsigned here, unlike the 4/8-wide paths */
+
+            ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val));
+
+            for (x = 0; x < iWidth; x += 8) {
+                int idx = x >> 2;
+                TT[idx] = _mm_add_epi32(TT[idx], PTT[idx]);
+                TT[idx + 1] = _mm_add_epi32(TT[idx + 1], PTT[idx + 1]);
+
+                T1 = _mm_slli_epi32(TT[idx], ishift_x);
+                T2 = _mm_slli_epi32(TT[idx + 1], ishift_x);
+
+                T1 = _mm_add_epi32(T1, ADD);
+                T1 = _mm_srai_epi32(T1, ishift_xy);
+                ADD = _mm_add_epi32(ADD, T3);
+
+                T2 = _mm_add_epi32(T2, ADD);
+                T2 = _mm_srai_epi32(T2, ishift_xy);
+                ADD = _mm_add_epi32(ADD, T3);
+
+                T1 = _mm_packus_epi32(T1, T2);
+                T1 = _mm_min_epu16(T1, max_val);
+                _mm_storeu_si128((__m128i*)(dst + x), T1);
+            }
+            dst += i_dst;
+        }
+    }
+}
+
+void intra_pred_dc_sse128(xavs2_t *h,   /* h supplies param->input_sample_bit_depth for the no-neighbour default */
+                          pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight)
+{   /* DC intra prediction: fills the iWidth x iHeight block at dst (row stride i_dst) with one value derived from the available neighbours. */
+    int bAboveAvail = dir_mode >> 8;    /* availability flags are packed into dir_mode: bits 8 and up = above */
+    int bLeftAvail = dir_mode & 0xFF;   /* low byte = left */
+
+    int   x, y;
+    int   iDCValue = 0;
+    pel_t* rpSrc = pSrc - 1;            /* left reference column: rpSrc[-y] */
+    int h_bitsize = tab_log2[iHeight];
+    int w_bitsize = tab_log2[iWidth];
+    int half_height = iHeight >> 1;     /* rounding term when averaging the left column only */
+    int half_width = iWidth >> 1;       /* rounding term when averaging the top row only */
+    __m128i T;
+    uint64_t v64;
+    /* Step 1: compute the DC value from whichever neighbours are available. */
+    if (bLeftAvail) {
+        for (y = 0; y < iHeight; y++) {
+            iDCValue += rpSrc[-y];
+        }
+
+        rpSrc = pSrc + 1;               /* top reference row: rpSrc[x] */
+        if (bAboveAvail) {
+            for (x = 0; x < iWidth; x++) {
+                iDCValue += rpSrc[x];
+            }
+            /* both sides: rounded mean, with the division replaced by multiply by 512 / (W + H) and >> 9 */
+            iDCValue += ((iWidth + iHeight) >> 1);
+            iDCValue = (iDCValue * (512 / (iWidth + iHeight))) >> 9;
+        }
+        else {
+            iDCValue += half_height;
+            iDCValue >>= h_bitsize;
+        }
+    }
+    else {
+        rpSrc = pSrc + 1;
+        if (bAboveAvail) {
+            for (x = 0; x < iWidth; x++) {
+                iDCValue += rpSrc[x];
+            }
+
+            iDCValue += half_width;
+            iDCValue >>= w_bitsize;
+        }
+        else {
+            iDCValue = 1 << (h->param->input_sample_bit_depth - 1);    /* no neighbours: half of the sample range */
+        }
+    }
+    /* Step 2: broadcast the DC value over the block; each 128-bit store covers 8 pels. */
+    switch (iWidth) {
+    case 4:
+        v64 = 0x0001000100010001 * iDCValue;    /* replicate the value into four 16-bit fields */
+        for (y = 0; y < iHeight; y++) {
+            M64(dst) = v64;                     /* M64 is presumably a 64-bit store - confirm against the macro */
+            dst += i_dst;
+        }
+        break;
+    case 8:
+        T = _mm_set1_epi16((pel_t)iDCValue);
+        for (y = 0; y < iHeight; y++) {
+            _mm_storeu_si128((__m128i*)(dst), T);
+            dst += i_dst;
+        }
+        break;
+    case 16:
+        T = _mm_set1_epi16((pel_t)iDCValue);
+        for (y = 0; y < iHeight; y++) {
+            _mm_storeu_si128((__m128i*)(dst + 0), T);
+            _mm_storeu_si128((__m128i*)(dst + 8), T);
+            dst += i_dst;
+        }
+        break;
+    case 32:
+        T = _mm_set1_epi16((pel_t)iDCValue);
+        for (y = 0; y < iHeight; y++) {
+            _mm_storeu_si128((__m128i*)(dst + 0), T);
+            _mm_storeu_si128((__m128i*)(dst + 8), T);
+            _mm_storeu_si128((__m128i*)(dst + 16), T);
+            _mm_storeu_si128((__m128i*)(dst + 24), T);
+            dst += i_dst;
+        }
+        break;
+    case 64:
+        T = _mm_set1_epi16((pel_t)iDCValue);
+        for (y = 0; y < iHeight; y++) {
+            _mm_storeu_si128((__m128i*)(dst + 0), T);
+            _mm_storeu_si128((__m128i*)(dst + 8), T);
+            _mm_storeu_si128((__m128i*)(dst + 16), T);
+            _mm_storeu_si128((__m128i*)(dst + 24), T);
+            _mm_storeu_si128((__m128i*)(dst + 32), T);
+            _mm_storeu_si128((__m128i*)(dst + 40), T);
+            _mm_storeu_si128((__m128i*)(dst + 48), T);
+            _mm_storeu_si128((__m128i*)(dst + 56), T);
+            dst += i_dst;
+        }
+        break;
+    default:
+        assert(0);  /* only widths 4, 8, 16, 32 and 64 are handled */
+        break;
+    }
 }
+#endif // #if !HIGH_BIT_DEPTH
 
 
diff --git a/source/common/vec/intrinsic_pixel.c b/source/common/vec/intrinsic_pixel.c
index 9c5a9fa..bc97d5b 100644
--- a/source/common/vec/intrinsic_pixel.c
+++ b/source/common/vec/intrinsic_pixel.c
@@ -45,6 +45,39 @@
 
 void xavs2_pixel_average_sse128(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height)
 {
+#if HIGH_BIT_DEPTH
+    int j;
+    __m128i D;
+
+    if (width & 7) {
+        //__m128i mask = _mm_load_si128((const __m128i *)intrinsic_mask_10bit[(width & 7) - 1]);
+        __m128i mask = _mm_load_si128((const __m128i*)intrinsic_mask[(width & 7) - 1]);
+
+        while (height--) {
+            for (j = 0; j < width - 7; j += 8) {
+                D = _mm_avg_epu16(_mm_loadu_si128((const __m128i *)(src1 + j)), _mm_loadu_si128((const __m128i *)(src2 + j)));
+                _mm_storeu_si128((__m128i *)(dst + j), D);
+            }
+
+            D = _mm_avg_epu16(_mm_loadu_si128((const __m128i *)(src1 + j)), _mm_loadu_si128((const __m128i *)(src2 + j)));
+            _mm_maskmoveu_si128(D, mask, (char *)&dst[j]);
+
+            src1 += i_src1;
+            src2 += i_src2;
+            dst += i_dst;
+        }
+    } else {
+        while (height--) {
+            for (j = 0; j < width; j += 8) {
+                D = _mm_avg_epu16(_mm_loadu_si128((const __m128i *)(src1 + j)), _mm_loadu_si128((const __m128i *)(src2 + j)));
+                _mm_storeu_si128((__m128i *)(dst + j), D);
+            }
+            src1 += i_src1;
+            src2 += i_src2;
+            dst += i_dst;
+        }
+    }
+#else
     int i, j;
     __m128i S1, S2, D;
 
@@ -81,7 +114,7 @@ void xavs2_pixel_average_sse128(pel_t *dst, int i_dst, pel_t *src1, int i_src1,
             dst += i_dst;
         }
     }
-
+#endif
 }
 
 /* ---------------------------------------------------------------------------
diff --git a/source/common/vec/intrinsic_sao.c b/source/common/vec/intrinsic_sao.c
index a19b76d..30c6eed 100644
--- a/source/common/vec/intrinsic_sao.c
+++ b/source/common/vec/intrinsic_sao.c
@@ -45,9 +45,12 @@
 #include <tmmintrin.h>
 #include <smmintrin.h>
 
+
+#if !HIGH_BIT_DEPTH
 /* ---------------------------------------------------------------------------
  */
-void SAO_on_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i_block_w, int i_block_h,
+void SAO_on_block_sse128(xavs2_t *h,
+                         pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i_block_w, int i_block_h,
                          int *lcu_avail, SAOBlkParam *sao_param)
 {
     int start_x, end_x, start_y, end_y;
@@ -631,7 +634,7 @@ void SAO_on_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i
         __m128i t0, t1, t2, t3, t4, src0, src1;
         __m128i mask ;
         __m128i shift_mask = _mm_set1_epi8(31);
-        int shift_bo = g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT;
+        int shift_bo = h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT;
         int end_x_16 = i_block_w - 15;
 
         r0 = _mm_set1_epi8((int8_t)(sao_param->startBand));
@@ -692,7 +695,846 @@ void SAO_on_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i
         exit(-1);
     }
     }
+}
+#else
+ /*****************************************************************************
+ *  Copyright (C) 2016 uavs2dec project,
+ *  National Engineering Laboratory for Video Technology(Shenzhen),
+ *  Digital Media R&D Center at Peking University Shenzhen Graduate School, China
+ *  Project Leader: Ronggang Wang <rgwang@pkusz.edu.cn>
+ *
+ *  Main Authors: Zhenyu Wang <wangzhenyu@pkusz.edu.cn>, Kui Fan <kuifan@pku.edu.cn>
+ *               Shenghao Zhang <1219759986@qq.com>, Bingjie Han, Kaili Yao, Hongbin Cao, Yueming Wang,
+ *               Jing Su, Jiaying Yan, Junru Li
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at rgwang@pkusz.edu.cn.
+ *****************************************************************************/
 
+/* ---------------------------------------------------------------------------
+ */
+void SAO_on_block_eo_0_sse128(xavs2_t *h,   /* h supplies param->input_sample_bit_depth for the upper clip bound */
+                              pel_t* dst, int i_dst, const pel_t* src, int i_src,
+    int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset)
+{   /* SAO edge offset along the horizontal direction: dst[x] = clip(src[x] + sao_offset[edgetype + 2], 0, max_pixel). */
+    int start_x, end_x;     /* processed columns are [start_x, end_x) */
+    int x, y;
+    /* edgetype = sign(src[x] - src[x-1]) + sign(src[x] - src[x+1]), in [-2, 2]; i_dst / i_src are row strides in pels */
+    int max_pixel = (1 << h->param->input_sample_bit_depth) - 1;
+    __m128i off0, off1, off2, off3, off4;
+    __m128i s0, s1, s2;
+    __m128i t0, t1, t2, t3, t4, etype;
+    __m128i c0, c1, c2, c3, c4;
+    __m128i mask;
+    __m128i min_val = _mm_setzero_si128();
+    __m128i max_val = _mm_set1_epi16(max_pixel);
+
+    int end_x_8;            /* column at which the last, partial group of fewer than 8 pels starts */
+    c0 = _mm_set1_epi16(-2);
+    c1 = _mm_set1_epi16(-1);
+    c2 = _mm_set1_epi16(0);
+    c3 = _mm_set1_epi16(1);
+    c4 = _mm_set1_epi16(2);
+
+    off0 = _mm_set1_epi16((pel_t)sao_offset[0]);
+    off1 = _mm_set1_epi16((pel_t)sao_offset[1]);
+    off2 = _mm_set1_epi16((pel_t)sao_offset[2]);
+    off3 = _mm_set1_epi16((pel_t)sao_offset[3]);
+    off4 = _mm_set1_epi16((pel_t)sao_offset[4]);
+    start_x = lcu_avail[SAO_L] ? 0 : 1;                         /* skip column 0 when there is no left neighbour */
+    end_x = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1);     /* skip the last column when there is no right neighbour */
+    end_x_8 = end_x - ((end_x - start_x) & 0x07);
+    /* Load the tail mask only when a partial group exists; otherwise the index would be -1 (read before the table). */
+    mask = (end_x > end_x_8) ? _mm_load_si128((__m128i*)(intrinsic_mask[end_x - end_x_8 - 1])) : _mm_setzero_si128();
+    if (i_block_w == 4) {
+        /* 4-wide block: one load covers left neighbour, centre and right neighbour; the others are byte shifts of it. */
+        /* NOTE(review): the mask row is selected by pel count - confirm intrinsic_mask rows are sized for 16-bit pels. */
+        for (y = 0; y < i_block_h; y++) {
+            //diff = src[start_x] - src[start_x - 1];
+            //leftsign = diff > 0 ? 1 : (diff < 0 ? -1 : 0);
+            s0 = _mm_loadu_si128((__m128i*) & src[start_x - 1]);
+            s1 = _mm_srli_si128(s0, 2);     /* centre pels: src[start_x ...] */
+            s2 = _mm_srli_si128(s0, 4);     /* right neighbours */
+
+            t3 = _mm_min_epu16(s0, s1);
+            t1 = _mm_cmpeq_epi16(t3, s0);
+            t2 = _mm_cmpeq_epi16(t3, s1);
+            t0 = _mm_subs_epi16(t2, t1); //leftsign
+
+            t3 = _mm_min_epu16(s1, s2);
+            t1 = _mm_cmpeq_epi16(t3, s1);
+            t2 = _mm_cmpeq_epi16(t3, s2);
+            t3 = _mm_subs_epi16(t1, t2); //rightsign
+
+            etype = _mm_adds_epi16(t0, t3); //edgetype
+
+            t0 = _mm_cmpeq_epi16(etype, c0);
+            t1 = _mm_cmpeq_epi16(etype, c1);
+            t2 = _mm_cmpeq_epi16(etype, c2);
+            t3 = _mm_cmpeq_epi16(etype, c3);
+            t4 = _mm_cmpeq_epi16(etype, c4);
+
+            t0 = _mm_and_si128(t0, off0);
+            t1 = _mm_and_si128(t1, off1);
+            t2 = _mm_and_si128(t2, off2);
+            t3 = _mm_and_si128(t3, off3);
+            t4 = _mm_and_si128(t4, off4);
+
+            t0 = _mm_adds_epi16(t0, t1);
+            t2 = _mm_adds_epi16(t2, t3);
+            t0 = _mm_adds_epi16(t0, t4);
+            t0 = _mm_adds_epi16(t0, t2);//get offset
+
+            t1 = _mm_adds_epi16(t0, s1);
+            t1 = _mm_min_epi16(t1, max_val);
+            t1 = _mm_max_epi16(t1, min_val);
+            _mm_maskmoveu_si128(t1, mask, (char*)(dst + start_x));  /* results belong to columns start_x.., as in the general path */
+
+            dst += i_dst;
+            src += i_src;
+        }
+    }
+    else {
+        /* general path: 8 pels per iteration, the last partial group is written through the mask */
+        for (y = 0; y < i_block_h; y++) {
+            //diff = src[start_x] - src[start_x - 1];
+            //leftsign = diff > 0 ? 1 : (diff < 0 ? -1 : 0);
+            for (x = start_x; x < end_x; x += 8) {
+                s0 = _mm_loadu_si128((__m128i*) & src[x - 1]);
+                s1 = _mm_loadu_si128((__m128i*) & src[x]);
+                s2 = _mm_loadu_si128((__m128i*) & src[x + 1]);
+
+                t3 = _mm_min_epu16(s0, s1);
+                t1 = _mm_cmpeq_epi16(t3, s0);
+                t2 = _mm_cmpeq_epi16(t3, s1);
+                t0 = _mm_subs_epi16(t2, t1); //leftsign
+
+                t3 = _mm_min_epu16(s1, s2);
+                t1 = _mm_cmpeq_epi16(t3, s1);
+                t2 = _mm_cmpeq_epi16(t3, s2);
+                t3 = _mm_subs_epi16(t1, t2); //rightsign
+
+                etype = _mm_adds_epi16(t0, t3); //edgetype
+
+                t0 = _mm_cmpeq_epi16(etype, c0);
+                t1 = _mm_cmpeq_epi16(etype, c1);
+                t2 = _mm_cmpeq_epi16(etype, c2);
+                t3 = _mm_cmpeq_epi16(etype, c3);
+                t4 = _mm_cmpeq_epi16(etype, c4);
+
+                t0 = _mm_and_si128(t0, off0);
+                t1 = _mm_and_si128(t1, off1);
+                t2 = _mm_and_si128(t2, off2);
+                t3 = _mm_and_si128(t3, off3);
+                t4 = _mm_and_si128(t4, off4);
+
+                t0 = _mm_adds_epi16(t0, t1);
+                t2 = _mm_adds_epi16(t2, t3);
+                t0 = _mm_adds_epi16(t0, t4);
+                t0 = _mm_adds_epi16(t0, t2);//get offset
+
+                t1 = _mm_adds_epi16(t0, s1);
+                t1 = _mm_min_epi16(t1, max_val);
+                t1 = _mm_max_epi16(t1, min_val);
+
+                if (x != end_x_8) {
+                    _mm_storeu_si128((__m128i*)(dst + x), t1);
+                }
+                else {
+                    _mm_maskmoveu_si128(t1, mask, (char*)(dst + x));
+                    break;
+                }
+            }
+            dst += i_dst;
+            src += i_src;
+        }
+    }
 }
 
+void SAO_on_block_bo_sse128(xavs2_t *h,
+                            pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const SAOBlkParam* saoBlkParam)
+{
+    int x, y;
+    int max_pixel = (1 << h->param->input_sample_bit_depth) - 1;
+    __m128i off0, off1, off2, off3;
+    __m128i s0, s1;
+    __m128i t0, t1, t2, t3;
+    __m128i mask;
+    __m128i min_val = _mm_setzero_si128();
+    __m128i max_val = _mm_set1_epi16(max_pixel);
+    // Band-offset SAO: a pixel whose band index (pixel >> shift_bo) equals one of r0..r3 gets the matching offset added; the result is clamped to [0, max_pixel].
+    // Pixels are processed as 16-bit lanes, eight per 128-bit register; i_block_h rows of i_block_w pixels are read from src and written to dst.
+    __m128i r0, r1, r2, r3;
+    int shift_bo = h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT;
+    int end_x_8 = i_block_w - 7;
+    // r0..r3: band indices startBand, startBand + 1, deltaBand, deltaBand + 1 (the +1 variants wrap modulo 32); off0..off3: their offsets
+    r0 = _mm_set1_epi16(saoBlkParam->startBand);
+    r1 = _mm_set1_epi16((saoBlkParam->startBand + 1) % 32);
+    r2 = _mm_set1_epi16(saoBlkParam->deltaBand);
+    r3 = _mm_set1_epi16((saoBlkParam->deltaBand + 1) % 32);
+    off0 = _mm_set1_epi16(saoBlkParam->offset[0]);
+    off1 = _mm_set1_epi16(saoBlkParam->offset[1]);
+    off2 = _mm_set1_epi16(saoBlkParam->offset[2]);
+    off3 = _mm_set1_epi16(saoBlkParam->offset[3]);
+
+    if (i_block_w == 4) {
+        mask = _mm_set_epi32(0, 0, -1, -1);   // store only the low 8 bytes: four 16-bit pixels per row
+
+        for (y = 0; y < i_block_h; y++) {
+            s0 = _mm_loadu_si128((__m128i*)src);
+
+            s1 = _mm_srai_epi16(s0, shift_bo);   // band index of every lane
+
+            t0 = _mm_cmpeq_epi16(s1, r0);
+            t1 = _mm_cmpeq_epi16(s1, r1);
+            t2 = _mm_cmpeq_epi16(s1, r2);
+            t3 = _mm_cmpeq_epi16(s1, r3);
+
+            t0 = _mm_and_si128(t0, off0);
+            t1 = _mm_and_si128(t1, off1);
+            t2 = _mm_and_si128(t2, off2);
+            t3 = _mm_and_si128(t3, off3);
+            t0 = _mm_or_si128(t0, t1);
+            t2 = _mm_or_si128(t2, t3);
+            t0 = _mm_or_si128(t0, t2);
+
+            t1 = _mm_adds_epi16(s0, t0);
+            t1 = _mm_min_epi16(t1, max_val);
+            t1 = _mm_max_epi16(t1, min_val);
+
+            _mm_maskmoveu_si128(t1, mask, (char *)(dst));
+
+            dst += i_dst;
+            src += i_src;
+        }
+    }
+    else {
+        // Tail mask for the last partial vector. It is only consumed when i_block_w is not a multiple of 8; in that case the index is (i_block_w & 7) - 1, otherwise the unused placeholder row 0 is loaded instead of reading intrinsic_mask[-1]. NOTE(review): assumes row n - 1 of intrinsic_mask enables exactly n 16-bit pixels -- confirm against its definition.
+        mask = _mm_load_si128((const __m128i*)intrinsic_mask[(i_block_w & 7) ? (i_block_w & 7) - 1 : 0]);
+        for (y = 0; y < i_block_h; y++) {
+            for (x = 0; x < i_block_w; x += 8) {
+                s0 = _mm_loadu_si128((__m128i*) & src[x]);
+
+                s1 = _mm_srai_epi16(s0, shift_bo);   // band index of every lane
+
+                t0 = _mm_cmpeq_epi16(s1, r0);
+                t1 = _mm_cmpeq_epi16(s1, r1);
+                t2 = _mm_cmpeq_epi16(s1, r2);
+                t3 = _mm_cmpeq_epi16(s1, r3);
+
+                t0 = _mm_and_si128(t0, off0);
+                t1 = _mm_and_si128(t1, off1);
+                t2 = _mm_and_si128(t2, off2);
+                t3 = _mm_and_si128(t3, off3);
+                t0 = _mm_or_si128(t0, t1);
+                t2 = _mm_or_si128(t2, t3);
+                t0 = _mm_or_si128(t0, t2);
+                // t0: per-lane offset, zero where no band matched
+
+                // saturating 16-bit add, then clamp to [0, max_pixel]
+                t1 = _mm_adds_epi16(s0, t0);
+                t1 = _mm_min_epi16(t1, max_val);
+                t1 = _mm_max_epi16(t1, min_val);
+
+                if (x < end_x_8) {
+                    _mm_storeu_si128((__m128i*) & dst[x], t1);
+                }
+                else {
+                    _mm_maskmoveu_si128(t1, mask, (char *)(dst + x));
+                }
+
+            }
 
+            dst += i_dst;
+            src += i_src;
+        }
+    }
+
+}
+
+void SAO_on_block_eo_90_sse128(xavs2_t *h,
+                               pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset)
+{
+    int start_y, end_y;
+    int x, y;
+    int max_pixel = (1 << h->param->input_sample_bit_depth) - 1;
+    __m128i off0, off1, off2, off3, off4;
+    __m128i s0, s1, s2;
+    __m128i t0, t1, t2, t3, t4, etype;
+    __m128i c0, c1, c2, c3, c4;
+    __m128i mask;
+    __m128i min_val = _mm_setzero_si128();
+    __m128i max_val = _mm_set1_epi16(max_pixel);
+    // Vertical edge-offset SAO: every pixel is compared with the pixels one row above and one row below, classified into an edge type, and shifted by that type's offset.
+    int end_x_8 = i_block_w - 7;
+    c0 = _mm_set1_epi16(-2);
+    c1 = _mm_set1_epi16(-1);
+    c2 = _mm_set1_epi16(0);
+    c3 = _mm_set1_epi16(1);
+    c4 = _mm_set1_epi16(2);
+    // off0..off4 are the offsets applied for edge types -2, -1, 0, +1, +2 respectively
+    off0 = _mm_set1_epi16((pel_t)sao_offset[0]);
+    off1 = _mm_set1_epi16((pel_t)sao_offset[1]);
+    off2 = _mm_set1_epi16((pel_t)sao_offset[2]);
+    off3 = _mm_set1_epi16((pel_t)sao_offset[3]);
+    off4 = _mm_set1_epi16((pel_t)sao_offset[4]);
+    start_y = lcu_avail[SAO_T] ? 0 : 1;
+    end_y = lcu_avail[SAO_D] ? i_block_h : (i_block_h - 1);
+    // the first / last row is skipped when lcu_avail[SAO_T] / lcu_avail[SAO_D] is zero (presumably: the row above / below the block is unavailable -- confirm)
+    dst += start_y * i_dst;
+    src += start_y * i_src;
+
+    if (i_block_w == 4) {
+        mask = _mm_set_epi32(0, 0, -1, -1);   // store only the low 8 bytes: four 16-bit pixels per row
+
+        for (y = start_y; y < end_y; y++) {
+            s0 = _mm_loadu_si128((__m128i*)(src - i_src));
+            s1 = _mm_loadu_si128((__m128i*)src);
+            s2 = _mm_loadu_si128((__m128i*)(src + i_src));
+
+            t3 = _mm_min_epu16(s0, s1);
+            t1 = _mm_cmpeq_epi16(t3, s0);
+            t2 = _mm_cmpeq_epi16(t3, s1);
+            t0 = _mm_subs_epi16(t2, t1); // sign(s1 - s0) per lane: +1, 0 or -1 (s0 = row above)
+
+            t3 = _mm_min_epu16(s1, s2);
+            t1 = _mm_cmpeq_epi16(t3, s1);
+            t2 = _mm_cmpeq_epi16(t3, s2);
+            t3 = _mm_subs_epi16(t1, t2); // sign(s1 - s2) per lane: +1, 0 or -1 (s2 = row below)
+
+            etype = _mm_adds_epi16(t0, t3); // edge type in [-2, 2]
+
+            t0 = _mm_cmpeq_epi16(etype, c0);
+            t1 = _mm_cmpeq_epi16(etype, c1);
+            t2 = _mm_cmpeq_epi16(etype, c2);
+            t3 = _mm_cmpeq_epi16(etype, c3);
+            t4 = _mm_cmpeq_epi16(etype, c4);
+
+            t0 = _mm_and_si128(t0, off0);
+            t1 = _mm_and_si128(t1, off1);
+            t2 = _mm_and_si128(t2, off2);
+            t3 = _mm_and_si128(t3, off3);
+            t4 = _mm_and_si128(t4, off4);
+
+            t0 = _mm_adds_epi16(t0, t1);
+            t2 = _mm_adds_epi16(t2, t3);
+            t0 = _mm_adds_epi16(t0, t4);
+            t0 = _mm_adds_epi16(t0, t2);// exactly one compare mask is set per lane, so the sum is the selected offset
+
+            // saturating 16-bit add of offset and pixel, then clamp to [0, max_pixel]
+            t1 = _mm_adds_epi16(t0, s1);
+            t1 = _mm_min_epi16(t1, max_val);
+            t1 = _mm_max_epi16(t1, min_val);
+
+            _mm_maskmoveu_si128(t1, mask, (char *)(dst));
+
+            dst += i_dst;
+            src += i_src;
+        }
+    }
+    else {
+        if (i_block_w & 0x07) {
+            mask = _mm_set_epi32(0, 0, -1, -1);
+            // the tail store below writes four pixels -- NOTE(review): this assumes i_block_w % 8 is always 4 on this path; confirm against callers
+            for (y = start_y; y < end_y; y++) {
+                for (x = 0; x < i_block_w; x += 8) {
+                    s0 = _mm_loadu_si128((__m128i*) & src[x - i_src]);
+                    s1 = _mm_loadu_si128((__m128i*) & src[x]);
+                    s2 = _mm_loadu_si128((__m128i*) & src[x + i_src]);
+
+                    t3 = _mm_min_epu16(s0, s1);
+                    t1 = _mm_cmpeq_epi16(t3, s0);
+                    t2 = _mm_cmpeq_epi16(t3, s1);
+                    t0 = _mm_subs_epi16(t2, t1); // sign(s1 - s0) per lane
+
+                    t3 = _mm_min_epu16(s1, s2);
+                    t1 = _mm_cmpeq_epi16(t3, s1);
+                    t2 = _mm_cmpeq_epi16(t3, s2);
+                    t3 = _mm_subs_epi16(t1, t2); // sign(s1 - s2) per lane
+
+                    etype = _mm_adds_epi16(t0, t3); // edge type in [-2, 2]
+
+                    t0 = _mm_cmpeq_epi16(etype, c0);
+                    t1 = _mm_cmpeq_epi16(etype, c1);
+                    t2 = _mm_cmpeq_epi16(etype, c2);
+                    t3 = _mm_cmpeq_epi16(etype, c3);
+                    t4 = _mm_cmpeq_epi16(etype, c4);
+
+                    t0 = _mm_and_si128(t0, off0);
+                    t1 = _mm_and_si128(t1, off1);
+                    t2 = _mm_and_si128(t2, off2);
+                    t3 = _mm_and_si128(t3, off3);
+                    t4 = _mm_and_si128(t4, off4);
+
+                    t0 = _mm_adds_epi16(t0, t1);
+                    t2 = _mm_adds_epi16(t2, t3);
+                    t0 = _mm_adds_epi16(t0, t4);
+                    t0 = _mm_adds_epi16(t0, t2);// selected offset per lane
+
+                    t1 = _mm_adds_epi16(t0, s1);
+                    t1 = _mm_min_epi16(t1, max_val);
+                    t1 = _mm_max_epi16(t1, min_val);
+                    // full 8-pixel store while more than 7 pixels remain, otherwise one masked tail store ends the row
+                    if (x < end_x_8) {
+                        _mm_storeu_si128((__m128i*)(dst + x), t1);
+                    }
+                    else {
+                        _mm_maskmoveu_si128(t1, mask, (char *)(dst + x));
+                        break;
+                    }
+                }
+                dst += i_dst;
+                src += i_src;
+            }
+        }
+        else {
+            for (y = start_y; y < end_y; y++) {
+                for (x = 0; x < i_block_w; x += 8) {
+                    s0 = _mm_loadu_si128((__m128i*) & src[x - i_src]);
+                    s1 = _mm_loadu_si128((__m128i*) & src[x]);
+                    s2 = _mm_loadu_si128((__m128i*) & src[x + i_src]);
+
+                    t3 = _mm_min_epu16(s0, s1);
+                    t1 = _mm_cmpeq_epi16(t3, s0);
+                    t2 = _mm_cmpeq_epi16(t3, s1);
+                    t0 = _mm_subs_epi16(t2, t1); // sign(s1 - s0) per lane
+
+                    t3 = _mm_min_epu16(s1, s2);
+                    t1 = _mm_cmpeq_epi16(t3, s1);
+                    t2 = _mm_cmpeq_epi16(t3, s2);
+                    t3 = _mm_subs_epi16(t1, t2); // sign(s1 - s2) per lane
+
+                    etype = _mm_adds_epi16(t0, t3); // edge type in [-2, 2]
+
+                    t0 = _mm_cmpeq_epi16(etype, c0);
+                    t1 = _mm_cmpeq_epi16(etype, c1);
+                    t2 = _mm_cmpeq_epi16(etype, c2);
+                    t3 = _mm_cmpeq_epi16(etype, c3);
+                    t4 = _mm_cmpeq_epi16(etype, c4);
+
+                    t0 = _mm_and_si128(t0, off0);
+                    t1 = _mm_and_si128(t1, off1);
+                    t2 = _mm_and_si128(t2, off2);
+                    t3 = _mm_and_si128(t3, off3);
+                    t4 = _mm_and_si128(t4, off4);
+
+                    t0 = _mm_adds_epi16(t0, t1);
+                    t2 = _mm_adds_epi16(t2, t3);
+                    t0 = _mm_adds_epi16(t0, t4);
+                    t0 = _mm_adds_epi16(t0, t2);// selected offset per lane
+
+                    t1 = _mm_adds_epi16(t0, s1);
+                    t1 = _mm_min_epi16(t1, max_val);
+                    t1 = _mm_max_epi16(t1, min_val);
+                    // width is a multiple of 8 on this path, so every store is a full vector
+                    _mm_storeu_si128((__m128i*)(dst + x), t1);
+                }
+                dst += i_dst;
+                src += i_src;
+            }
+        }
+    }
+}
+
+void SAO_on_block_eo_135_sse128(xavs2_t *h,
+                                pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h,
+        const int* lcu_avail, const int* sao_offset)
+{
+    int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn;
+    int x, y;
+    int max_pixel = (1 << h->param->input_sample_bit_depth) - 1;
+    __m128i off0, off1, off2, off3, off4;
+    __m128i s0, s1, s2;
+    __m128i t0, t1, t2, t3, t4, etype;
+    __m128i c0, c1, c2, c3, c4;
+    __m128i min_val = _mm_setzero_si128();
+    __m128i max_val = _mm_set1_epi16(max_pixel);
+    // Diagonal edge-offset SAO: every pixel is compared with its upper-left (src[x - i_src - 1]) and lower-right (src[x + i_src + 1]) neighbours, classified into an edge type in [-2, 2] and shifted by that type's offset.
+    __m128i mask_r0, mask_r, mask_rn;
+    int end_x_r0_8, end_x_r_8, end_x_rn_8;
+
+    c0 = _mm_set1_epi16(-2);
+    c1 = _mm_set1_epi16(-1);
+    c2 = _mm_set1_epi16(0);
+    c3 = _mm_set1_epi16(1);
+    c4 = _mm_set1_epi16(2);
+    // off0..off4 are the offsets applied for edge types -2, -1, 0, +1, +2 respectively
+    off0 = _mm_set1_epi16((pel_t)sao_offset[0]);
+    off1 = _mm_set1_epi16((pel_t)sao_offset[1]);
+    off2 = _mm_set1_epi16((pel_t)sao_offset[2]);
+    off3 = _mm_set1_epi16((pel_t)sao_offset[3]);
+    off4 = _mm_set1_epi16((pel_t)sao_offset[4]);
+    // [start_x, end_x) column span per row class (r0 = first row, r = middle rows, rn = last row), derived from the lcu_avail flags
+    start_x_r0 = lcu_avail[SAO_TL] ? 0 : 1;
+    end_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1)) : 1;
+    start_x_r = lcu_avail[SAO_L] ? 0 : 1;
+    end_x_r = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1);
+    start_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1);
+    end_x_rn = lcu_avail[SAO_DR] ? i_block_w : (i_block_w - 1);
+
+    end_x_r0_8 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x07);
+    end_x_r_8 = end_x_r - ((end_x_r - start_x_r) & 0x07);
+    end_x_rn_8 = end_x_rn - ((end_x_rn - start_x_rn) & 0x07);
+    // end_x_*_8 is the x at which the final partial vector (fewer than 8 pixels) of a row starts; it equals end_x_* when the span is a multiple of 8, in which case no masked store happens.
+
+    //first row
+    for (x = start_x_r0; x < end_x_r0; x += 8) {
+        s0 = _mm_loadu_si128((__m128i*) & src[x - i_src - 1]);
+        s1 = _mm_loadu_si128((__m128i*) & src[x]);
+        s2 = _mm_loadu_si128((__m128i*) & src[x + i_src + 1]);
+
+        t3 = _mm_min_epu16(s0, s1);
+        t1 = _mm_cmpeq_epi16(t3, s0);
+        t2 = _mm_cmpeq_epi16(t3, s1);
+        t0 = _mm_subs_epi16(t2, t1); // sign(s1 - s0) per lane: +1, 0 or -1
+
+        t3 = _mm_min_epu16(s1, s2);
+        t1 = _mm_cmpeq_epi16(t3, s1);
+        t2 = _mm_cmpeq_epi16(t3, s2);
+        t3 = _mm_subs_epi16(t1, t2); // sign(s1 - s2) per lane: +1, 0 or -1
+
+        etype = _mm_adds_epi16(t0, t3); // edge type in [-2, 2]
+
+        t0 = _mm_cmpeq_epi16(etype, c0);
+        t1 = _mm_cmpeq_epi16(etype, c1);
+        t2 = _mm_cmpeq_epi16(etype, c2);
+        t3 = _mm_cmpeq_epi16(etype, c3);
+        t4 = _mm_cmpeq_epi16(etype, c4);
+
+        t0 = _mm_and_si128(t0, off0);
+        t1 = _mm_and_si128(t1, off1);
+        t2 = _mm_and_si128(t2, off2);
+        t3 = _mm_and_si128(t3, off3);
+        t4 = _mm_and_si128(t4, off4);
+
+        t0 = _mm_adds_epi16(t0, t1);
+        t2 = _mm_adds_epi16(t2, t3);
+        t0 = _mm_adds_epi16(t0, t4);
+        t0 = _mm_adds_epi16(t0, t2);// exactly one compare mask is set per lane, so the sum is the selected offset
+
+        // saturating 16-bit add of offset and pixel, then clamp to [0, max_pixel]
+        t1 = _mm_adds_epi16(t0, s1);
+        t1 = _mm_min_epi16(t1, max_val);
+        t1 = _mm_max_epi16(t1, min_val);
+
+        if (x != end_x_r0_8) {
+            _mm_storeu_si128((__m128i*)(dst + x), t1);
+        }
+        else {
+            mask_r0 = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_8 - 1]));
+            _mm_maskmoveu_si128(t1, mask_r0, (char *)(dst + x));
+            break;
+        }
+    }
+    dst += i_dst;
+    src += i_src;
+    // mask_r is only consumed when the middle-row span has a partial tail; when the span is a multiple of 8 the unused placeholder row 0 is loaded instead of reading intrinsic_mask[-1]
+    mask_r = _mm_load_si128((__m128i*)(intrinsic_mask[(end_x_r > end_x_r_8) ? (end_x_r - end_x_r_8 - 1) : 0]));
+    //middle rows
+    for (y = 1; y < i_block_h - 1; y++) {
+        for (x = start_x_r; x < end_x_r; x += 8) {
+            s0 = _mm_loadu_si128((__m128i*) & src[x - i_src - 1]);
+            s1 = _mm_loadu_si128((__m128i*) & src[x]);
+            s2 = _mm_loadu_si128((__m128i*) & src[x + i_src + 1]);
+
+            t3 = _mm_min_epu16(s0, s1);
+            t1 = _mm_cmpeq_epi16(t3, s0);
+            t2 = _mm_cmpeq_epi16(t3, s1);
+            t0 = _mm_subs_epi16(t2, t1); // sign(s1 - s0) per lane
+
+            t3 = _mm_min_epu16(s1, s2);
+            t1 = _mm_cmpeq_epi16(t3, s1);
+            t2 = _mm_cmpeq_epi16(t3, s2);
+            t3 = _mm_subs_epi16(t1, t2); // sign(s1 - s2) per lane
+
+            etype = _mm_adds_epi16(t0, t3); // edge type in [-2, 2]
+
+            t0 = _mm_cmpeq_epi16(etype, c0);
+            t1 = _mm_cmpeq_epi16(etype, c1);
+            t2 = _mm_cmpeq_epi16(etype, c2);
+            t3 = _mm_cmpeq_epi16(etype, c3);
+            t4 = _mm_cmpeq_epi16(etype, c4);
+
+            t0 = _mm_and_si128(t0, off0);
+            t1 = _mm_and_si128(t1, off1);
+            t2 = _mm_and_si128(t2, off2);
+            t3 = _mm_and_si128(t3, off3);
+            t4 = _mm_and_si128(t4, off4);
+
+            t0 = _mm_adds_epi16(t0, t1);
+            t2 = _mm_adds_epi16(t2, t3);
+            t0 = _mm_adds_epi16(t0, t4);
+            t0 = _mm_adds_epi16(t0, t2);// selected offset per lane
+
+            t1 = _mm_adds_epi16(t0, s1);
+            t1 = _mm_min_epi16(t1, max_val);
+            t1 = _mm_max_epi16(t1, min_val);
+
+            if (x != end_x_r_8) {
+                _mm_storeu_si128((__m128i*)(dst + x), t1);
+            }
+            else {
+                _mm_maskmoveu_si128(t1, mask_r, (char *)(dst + x));
+                break;
+            }
+        }
+        dst += i_dst;
+        src += i_src;
+    }
+    //last row
+    for (x = start_x_rn; x < end_x_rn; x += 8) {
+        s0 = _mm_loadu_si128((__m128i*) & src[x - i_src - 1]);
+        s1 = _mm_loadu_si128((__m128i*) & src[x]);
+        s2 = _mm_loadu_si128((__m128i*) & src[x + i_src + 1]);
+
+        t3 = _mm_min_epu16(s0, s1);
+        t1 = _mm_cmpeq_epi16(t3, s0);
+        t2 = _mm_cmpeq_epi16(t3, s1);
+        t0 = _mm_subs_epi16(t2, t1); // sign(s1 - s0) per lane
+
+        t3 = _mm_min_epu16(s1, s2);
+        t1 = _mm_cmpeq_epi16(t3, s1);
+        t2 = _mm_cmpeq_epi16(t3, s2);
+        t3 = _mm_subs_epi16(t1, t2); // sign(s1 - s2) per lane
+
+        etype = _mm_adds_epi16(t0, t3); // edge type in [-2, 2]
+
+        t0 = _mm_cmpeq_epi16(etype, c0);
+        t1 = _mm_cmpeq_epi16(etype, c1);
+        t2 = _mm_cmpeq_epi16(etype, c2);
+        t3 = _mm_cmpeq_epi16(etype, c3);
+        t4 = _mm_cmpeq_epi16(etype, c4);
+
+        t0 = _mm_and_si128(t0, off0);
+        t1 = _mm_and_si128(t1, off1);
+        t2 = _mm_and_si128(t2, off2);
+        t3 = _mm_and_si128(t3, off3);
+        t4 = _mm_and_si128(t4, off4);
+
+        t0 = _mm_adds_epi16(t0, t1);
+        t2 = _mm_adds_epi16(t2, t3);
+        t0 = _mm_adds_epi16(t0, t4);
+        t0 = _mm_adds_epi16(t0, t2);// selected offset per lane
+
+        t1 = _mm_adds_epi16(t0, s1);
+        t1 = _mm_min_epi16(t1, max_val);
+        t1 = _mm_max_epi16(t1, min_val);
+
+        if (x != end_x_rn_8) {
+            _mm_storeu_si128((__m128i*)(dst + x), t1);
+        }
+        else {
+            mask_rn = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_8 - 1]));
+            _mm_maskmoveu_si128(t1, mask_rn, (char *)(dst + x));
+            break;
+        }
+    }
+}
+
+void SAO_on_block_eo_45_sse128(xavs2_t *h,
+                               pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h,
+                            const int* lcu_avail, const int* sao_offset)
+{
+    int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn;
+    int x, y;
+    int max_pixel = (1 << h->param->input_sample_bit_depth) - 1;
+    __m128i off0, off1, off2, off3, off4;
+    __m128i s0, s1, s2;
+    __m128i t0, t1, t2, t3, t4, etype;
+    __m128i c0, c1, c2, c3, c4;
+    __m128i min_val = _mm_setzero_si128();
+    __m128i max_val = _mm_set1_epi16(max_pixel);
+    // Diagonal edge-offset SAO: every pixel is compared with its upper-right (src[x - i_src + 1]) and lower-left (src[x + i_src - 1]) neighbours, classified into an edge type in [-2, 2] and shifted by that type's offset.
+    __m128i mask_r0, mask_r, mask_rn;
+    int end_x_r0_8, end_x_r_8, end_x_rn_8;
+
+    c0 = _mm_set1_epi16(-2);
+    c1 = _mm_set1_epi16(-1);
+    c2 = _mm_set1_epi16(0);
+    c3 = _mm_set1_epi16(1);
+    c4 = _mm_set1_epi16(2);
+    // off0..off4 are the offsets applied for edge types -2, -1, 0, +1, +2 respectively
+    off0 = _mm_set1_epi16((pel_t)sao_offset[0]);
+    off1 = _mm_set1_epi16((pel_t)sao_offset[1]);
+    off2 = _mm_set1_epi16((pel_t)sao_offset[2]);
+    off3 = _mm_set1_epi16((pel_t)sao_offset[3]);
+    off4 = _mm_set1_epi16((pel_t)sao_offset[4]);
+    // [start_x, end_x) column span per row class (r0 = first row, r = middle rows, rn = last row), derived from the lcu_avail flags
+    start_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1);
+    end_x_r0 = lcu_avail[SAO_TR] ? i_block_w : (i_block_w - 1);
+    start_x_r = lcu_avail[SAO_L] ? 0 : 1;
+    end_x_r = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1);
+    start_x_rn = lcu_avail[SAO_DL] ? 0 : 1;
+    end_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1)) : 1;
+
+    end_x_r0_8 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x07);
+    end_x_r_8 = end_x_r - ((end_x_r - start_x_r) & 0x07);
+    end_x_rn_8 = end_x_rn - ((end_x_rn - start_x_rn) & 0x07);
+    // end_x_*_8 is the x at which the final partial vector (fewer than 8 pixels) of a row starts; it equals end_x_* when the span is a multiple of 8, in which case no masked store happens.
+
+    //first row
+    for (x = start_x_r0; x < end_x_r0; x += 8) {
+        s0 = _mm_loadu_si128((__m128i*) & src[x - i_src + 1]);
+        s1 = _mm_loadu_si128((__m128i*) & src[x]);
+        s2 = _mm_loadu_si128((__m128i*) & src[x + i_src - 1]);
+
+        t3 = _mm_min_epu16(s0, s1);
+        t1 = _mm_cmpeq_epi16(t3, s0);
+        t2 = _mm_cmpeq_epi16(t3, s1);
+        t0 = _mm_subs_epi16(t2, t1); // sign(s1 - s0) per lane: +1, 0 or -1
+
+        t3 = _mm_min_epu16(s1, s2);
+        t1 = _mm_cmpeq_epi16(t3, s1);
+        t2 = _mm_cmpeq_epi16(t3, s2);
+        t3 = _mm_subs_epi16(t1, t2); // sign(s1 - s2) per lane: +1, 0 or -1
+
+        etype = _mm_adds_epi16(t0, t3); // edge type in [-2, 2]
+
+        t0 = _mm_cmpeq_epi16(etype, c0);
+        t1 = _mm_cmpeq_epi16(etype, c1);
+        t2 = _mm_cmpeq_epi16(etype, c2);
+        t3 = _mm_cmpeq_epi16(etype, c3);
+        t4 = _mm_cmpeq_epi16(etype, c4);
+
+        t0 = _mm_and_si128(t0, off0);
+        t1 = _mm_and_si128(t1, off1);
+        t2 = _mm_and_si128(t2, off2);
+        t3 = _mm_and_si128(t3, off3);
+        t4 = _mm_and_si128(t4, off4);
+
+        t0 = _mm_adds_epi16(t0, t1);
+        t2 = _mm_adds_epi16(t2, t3);
+        t0 = _mm_adds_epi16(t0, t4);
+        t0 = _mm_adds_epi16(t0, t2);// exactly one compare mask is set per lane, so the sum is the selected offset
+        // saturating 16-bit add of offset and pixel, then clamp to [0, max_pixel]
+        t1 = _mm_adds_epi16(t0, s1);
+        t1 = _mm_min_epi16(t1, max_val);
+        t1 = _mm_max_epi16(t1, min_val);
+
+        if (x != end_x_r0_8) {
+            _mm_storeu_si128((__m128i*)(dst + x), t1);
+        }
+        else {
+            mask_r0 = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_8 - 1]));
+            _mm_maskmoveu_si128(t1, mask_r0, (char *)(dst + x));
+            break;
+        }
+    }
+    dst += i_dst;
+    src += i_src;
+    // mask_r is only consumed when the middle-row span has a partial tail; when the span is a multiple of 8 the unused placeholder row 0 is loaded instead of reading intrinsic_mask[-1]
+    mask_r = _mm_load_si128((__m128i*)(intrinsic_mask[(end_x_r > end_x_r_8) ? (end_x_r - end_x_r_8 - 1) : 0]));
+    //middle rows
+    for (y = 1; y < i_block_h - 1; y++) {
+        for (x = start_x_r; x < end_x_r; x += 8) {
+            s0 = _mm_loadu_si128((__m128i*) & src[x - i_src + 1]);
+            s1 = _mm_loadu_si128((__m128i*) & src[x]);
+            s2 = _mm_loadu_si128((__m128i*) & src[x + i_src - 1]);
+
+            t3 = _mm_min_epu16(s0, s1);
+            t1 = _mm_cmpeq_epi16(t3, s0);
+            t2 = _mm_cmpeq_epi16(t3, s1);
+            t0 = _mm_subs_epi16(t2, t1); // sign(s1 - s0) per lane
+
+            t3 = _mm_min_epu16(s1, s2);
+            t1 = _mm_cmpeq_epi16(t3, s1);
+            t2 = _mm_cmpeq_epi16(t3, s2);
+            t3 = _mm_subs_epi16(t1, t2); // sign(s1 - s2) per lane
+
+            etype = _mm_adds_epi16(t0, t3); // edge type in [-2, 2]
+
+            t0 = _mm_cmpeq_epi16(etype, c0);
+            t1 = _mm_cmpeq_epi16(etype, c1);
+            t2 = _mm_cmpeq_epi16(etype, c2);
+            t3 = _mm_cmpeq_epi16(etype, c3);
+            t4 = _mm_cmpeq_epi16(etype, c4);
+
+            t0 = _mm_and_si128(t0, off0);
+            t1 = _mm_and_si128(t1, off1);
+            t2 = _mm_and_si128(t2, off2);
+            t3 = _mm_and_si128(t3, off3);
+            t4 = _mm_and_si128(t4, off4);
+
+            t0 = _mm_adds_epi16(t0, t1);
+            t2 = _mm_adds_epi16(t2, t3);
+            t0 = _mm_adds_epi16(t0, t4);
+            t0 = _mm_adds_epi16(t0, t2);// selected offset per lane
+
+            t1 = _mm_adds_epi16(t0, s1);
+            t1 = _mm_min_epi16(t1, max_val);
+            t1 = _mm_max_epi16(t1, min_val);
+
+            if (x != end_x_r_8) {
+                _mm_storeu_si128((__m128i*)(dst + x), t1);
+            }
+            else {
+                _mm_maskmoveu_si128(t1, mask_r, (char *)(dst + x));
+                break;
+            }
+        }
+        dst += i_dst;
+        src += i_src;
+    }
+    for (x = start_x_rn; x < end_x_rn; x += 8) {   //last row
+        s0 = _mm_loadu_si128((__m128i*) & src[x - i_src + 1]);
+        s1 = _mm_loadu_si128((__m128i*) & src[x]);
+        s2 = _mm_loadu_si128((__m128i*) & src[x + i_src - 1]);
+
+        t3 = _mm_min_epu16(s0, s1);
+        t1 = _mm_cmpeq_epi16(t3, s0);
+        t2 = _mm_cmpeq_epi16(t3, s1);
+        t0 = _mm_subs_epi16(t2, t1); // sign(s1 - s0) per lane
+
+        t3 = _mm_min_epu16(s1, s2);
+        t1 = _mm_cmpeq_epi16(t3, s1);
+        t2 = _mm_cmpeq_epi16(t3, s2);
+        t3 = _mm_subs_epi16(t1, t2); // sign(s1 - s2) per lane
+
+        etype = _mm_adds_epi16(t0, t3); // edge type in [-2, 2]
+
+        t0 = _mm_cmpeq_epi16(etype, c0);
+        t1 = _mm_cmpeq_epi16(etype, c1);
+        t2 = _mm_cmpeq_epi16(etype, c2);
+        t3 = _mm_cmpeq_epi16(etype, c3);
+        t4 = _mm_cmpeq_epi16(etype, c4);
+
+        t0 = _mm_and_si128(t0, off0);
+        t1 = _mm_and_si128(t1, off1);
+        t2 = _mm_and_si128(t2, off2);
+        t3 = _mm_and_si128(t3, off3);
+        t4 = _mm_and_si128(t4, off4);
+
+        t0 = _mm_adds_epi16(t0, t1);
+        t2 = _mm_adds_epi16(t2, t3);
+        t0 = _mm_adds_epi16(t0, t4);
+        t0 = _mm_adds_epi16(t0, t2);// selected offset per lane
+
+        t1 = _mm_adds_epi16(t0, s1);
+        t1 = _mm_min_epi16(t1, max_val);
+        t1 = _mm_max_epi16(t1, min_val);
+
+        if (x != end_x_rn_8) {
+            _mm_storeu_si128((__m128i*)(dst + x), t1);
+        }
+        else {
+            mask_rn = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_8 - 1]));
+            _mm_maskmoveu_si128(t1, mask_rn, (char *)(dst + x));
+            break;
+        }
+    }
+}
+#endif // !HIGH_BIT_DEPTH
diff --git a/source/common/x86/dct8.h b/source/common/x86/dct8.h
index b0ec1aa..d5b4c70 100644
--- a/source/common/x86/dct8.h
+++ b/source/common/x86/dct8.h
@@ -28,14 +28,16 @@
 #define XAVS2_I386_DCT8_H
 
 #define xavs2_dct_4x4_sse2 FPFX(dct_4x4_sse2)
-void xavs2_dct_4x4_sse2   (const coeff_t *src, coeff_t *dst, int i_src);
+void xavs2_dct_4x4_sse2   (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
+#if defined(__AVX2__)
 #define xavs2_dct_4x4_avx2 FPFX(dct_4x4_avx2)
 void xavs2_dct_4x4_avx2   (const coeff_t *src, coeff_t *dst, int i_src);
+#endif
 #define xavs2_dct_8x8_sse2 FPFX(dct_8x8_sse2)
-void xavs2_dct_8x8_sse2   (const coeff_t *src, coeff_t *dst, int i_src);
+void xavs2_dct_8x8_sse2   (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
 #define xavs2_dct_8x8_sse4 FPFX(dct_8x8_sse4)
-void xavs2_dct_8x8_sse4   (const coeff_t *src, coeff_t *dst, int i_src);
-#if ARCH_X86_64
+void xavs2_dct_8x8_sse4   (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src);
+#if ARCH_X86_64 && defined(__AVX2__)
 #define xavs2_dct_8x8_avx2 FPFX(dct_8x8_avx2)
 void xavs2_dct_8x8_avx2   (const coeff_t *src, coeff_t *dst, int i_src);
 #define xavs2_dct_16x16_avx2 FPFX(dct_16x16_avx2)
@@ -45,14 +47,14 @@ void xavs2_dct_32x32_avx2 (const coeff_t *src, coeff_t *dst, int i_src);
 #endif
 
 #define xavs2_idct_4x4_sse2 FPFX(idct_4x4_sse2)
-void xavs2_idct_4x4_sse2  (const coeff_t *src, coeff_t *dst, int i_dst);
+void xavs2_idct_4x4_sse2  (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
 #define xavs2_idct_8x8_ssse3 FPFX(idct_8x8_ssse3)
-void xavs2_idct_8x8_ssse3 (const coeff_t *src, coeff_t *dst, int i_dst);
-#if ARCH_X86_64
+void xavs2_idct_8x8_ssse3 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
+#define xavs2_idct_8x8_sse2 FPFX(idct_8x8_sse2)
+void xavs2_idct_8x8_sse2  (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst);
+#if ARCH_X86_64 && defined(__AVX2__)
 #define xavs2_idct_4x4_avx2 FPFX(idct_4x4_avx2)
 void xavs2_idct_4x4_avx2  (const coeff_t *src, coeff_t *dst, int i_dst);
-#define xavs2_idct_8x8_sse2 FPFX(idct_8x8_sse2)
-void xavs2_idct_8x8_sse2  (const coeff_t *src, coeff_t *dst, int i_dst);
 #define xavs2_idct_8x8_avx2 FPFX(idct_8x8_avx2)
 void xavs2_idct_8x8_avx2  (const coeff_t *src, coeff_t *dst, int i_dst);
 #define xavs2_idct_16x16_avx2 FPFX(idct_16x16_avx2)
diff --git a/source/common/x86/pixel-util.h b/source/common/x86/pixel-util.h
index c70c3ef..d6b69b2 100644
--- a/source/common/x86/pixel-util.h
+++ b/source/common/x86/pixel-util.h
@@ -38,11 +38,12 @@ void xavs2_getResidual16_sse4(const pel_t *fenc, const pel_t *pred, int16_t *res
 void xavs2_getResidual32_sse2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride);
 #define xavs2_getResidual32_sse4 FPFX(getResidual32_sse4)
 void xavs2_getResidual32_sse4(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride);
+#if defined(__AVX2__)
 #define xavs2_getResidual16_avx2 FPFX(getResidual16_avx2)
 void xavs2_getResidual16_avx2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride);
 #define xavs2_getResidual32_avx2 FPFX(getResidual32_avx2)
 void xavs2_getResidual32_avx2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride);
-
+#endif
 #define xavs2_transpose4_sse2 FPFX(transpose4_sse2)
 void xavs2_transpose4_sse2(pel_t *dst, const pel_t *src, intptr_t stride);
 #define xavs2_transpose8_sse2 FPFX(transpose8_sse2)
@@ -53,7 +54,7 @@ void xavs2_transpose16_sse2(pel_t *dst, const pel_t *src, intptr_t stride);
 void xavs2_transpose32_sse2(pel_t *dst, const pel_t *src, intptr_t stride);
 #define xavs2_transpose64_sse2 FPFX(transpose64_sse2)
 void xavs2_transpose64_sse2(pel_t *dst, const pel_t *src, intptr_t stride);
-
+#if defined(__AVX2__)
 #define xavs2_transpose8_avx2 FPFX(transpose8_avx2)
 void xavs2_transpose8_avx2(pel_t *dst, const pel_t *src, intptr_t stride);
 #define xavs2_transpose16_avx2 FPFX(transpose16_avx2)
@@ -62,7 +63,7 @@ void xavs2_transpose16_avx2(pel_t *dst, const pel_t *src, intptr_t stride);
 void xavs2_transpose32_avx2(pel_t *dst, const pel_t *src, intptr_t stride);
 #define xavs2_transpose64_avx2 FPFX(transpose64_avx2)
 void xavs2_transpose64_avx2(pel_t *dst, const pel_t *src, intptr_t stride);
-
+#endif
 #define xavs2_count_nonzero_4x4_ssse3 FPFX(count_nonzero_4x4_ssse3)
 int xavs2_count_nonzero_4x4_ssse3(const int16_t *quantCoeff);
 #define xavs2_count_nonzero_8x8_ssse3 FPFX(count_nonzero_8x8_ssse3)
@@ -71,6 +72,7 @@ int xavs2_count_nonzero_8x8_ssse3(const int16_t *quantCoeff);
 int xavs2_count_nonzero_16x16_ssse3(const int16_t *quantCoeff);
 #define xavs2_count_nonzero_32x32_ssse3 FPFX(count_nonzero_32x32_ssse3)
 int xavs2_count_nonzero_32x32_ssse3(const int16_t *quantCoeff);
+#if defined(__AVX2__)
 #define xavs2_count_nonzero_4x4_avx2 FPFX(count_nonzero_4x4_avx2)
 int xavs2_count_nonzero_4x4_avx2(const int16_t *quantCoeff);
 #define xavs2_count_nonzero_8x8_avx2 FPFX(count_nonzero_8x8_avx2)
@@ -79,11 +81,11 @@ int xavs2_count_nonzero_8x8_avx2(const int16_t *quantCoeff);
 int xavs2_count_nonzero_16x16_avx2(const int16_t *quantCoeff);
 #define xavs2_count_nonzero_32x32_avx2 FPFX(count_nonzero_32x32_avx2)
 int xavs2_count_nonzero_32x32_avx2(const int16_t *quantCoeff);
-
-#define xavs2_weight_pp_sse4 FPFX(weight_pp_sse4)
-void xavs2_weight_pp_sse4(const pel_t *src, pel_t *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
 #define xavs2_weight_pp_avx2 FPFX(weight_pp_avx2)
 void xavs2_weight_pp_avx2(const pel_t *src, pel_t *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
+#endif
+#define xavs2_weight_pp_sse4 FPFX(weight_pp_sse4)
+void xavs2_weight_pp_sse4(const pel_t *src, pel_t *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
 #define xavs2_weight_sp_sse4 FPFX(weight_sp_sse4)
 void xavs2_weight_sp_sse4(const int16_t *src, pel_t *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 
@@ -103,17 +105,18 @@ float xavs2_pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width);
 
 #define xavs2_scale1D_128to64_ssse3 FPFX(scale1D_128to64_ssse3)
 void xavs2_scale1D_128to64_ssse3(pel_t*, const pel_t*);
-#define xavs2_scale1D_128to64_avx2 FPFX(scale1D_128to64_avx2)
-void xavs2_scale1D_128to64_avx2(pel_t*, const pel_t*);
 #define xavs2_scale2D_64to32_ssse3 FPFX(scale2D_64to32_ssse3)
 void xavs2_scale2D_64to32_ssse3(pel_t*, const pel_t*, intptr_t);
+#if defined(__AVX2__)
+#define xavs2_scale1D_128to64_avx2 FPFX(scale1D_128to64_avx2)
+void xavs2_scale1D_128to64_avx2(pel_t*, const pel_t*);
 #define xavs2_scale2D_64to32_avx2 FPFX(scale2D_64to32_avx2)
 void xavs2_scale2D_64to32_avx2(pel_t*, const pel_t*, intptr_t);
-
-#define xavs2_scanPosLast_x64 FPFX(scanPosLast_x64)
-int xavs2_scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t *scanCG4x4, const int trSize);
 #define xavs2_scanPosLast_avx2_bmi2 FPFX(scanPosLast_avx2_bmi2)
 int xavs2_scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t *scanCG4x4, const int trSize);
+#endif
+#define xavs2_scanPosLast_x64 FPFX(scanPosLast_x64)
+int xavs2_scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t *scanCG4x4, const int trSize);
 #define xavs2_findPosFirstLast_ssse3 FPFX(findPosFirstLast_ssse3)
 uint32_t xavs2_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
@@ -123,7 +126,8 @@ uint32_t xavs2_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, int
 
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
     void xavs2_pixel_sub_ps_ ## W ## x ## H ## cpu(coeff_t *dst, intptr_t destride, const pel_t *src0, const pel_t *src1, intptr_t srcstride0, intptr_t srcstride1); \
-    void xavs2_pixel_add_ps_ ## W ## x ## H ## cpu(pel_t *dst, intptr_t destride, const pel_t *src0, const coeff_t * src1, intptr_t srcStride0, intptr_t srcStride1);
+    void xavs2_pixel_add_ps_ ## W ## x ## H ## cpu(xavs2_t *h, pel_t *dst, intptr_t destride, const pel_t *src0, const coeff_t * src1, intptr_t srcStride0, intptr_t srcStride1);
+
 
 #define CHROMA_PIXELSUB_DEF(cpu) \
     SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
@@ -139,7 +143,7 @@ uint32_t xavs2_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, int
 
 #define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \
     void xavs2_pixel_sub_ps_ ## W ## x ## H ## cpu(coeff_t *dst, intptr_t destride, const pel_t *src0, const pel_t *src1, intptr_t srcstride0, intptr_t srcstride1); \
-    void xavs2_pixel_add_ps_ ## W ## x ## H ## cpu(pel_t *dst, intptr_t destride, const pel_t *src0, const coeff_t * src1, intptr_t srcStride0, intptr_t srcStride1);
+    void xavs2_pixel_add_ps_ ## W ## x ## H ## cpu(xavs2_t *h, pel_t *dst, intptr_t destride, const pel_t *src0, const coeff_t * src1, intptr_t srcStride0, intptr_t srcStride1);
 
 #define LUMA_PIXELSUB_DEF(cpu) \
     SETUP_LUMA_PIXELSUB_PS_FUNC(8,   8, cpu); \
diff --git a/source/common/x86/pixel.h b/source/common/x86/pixel.h
index 06370b7..266e4cc 100644
--- a/source/common/x86/pixel.h
+++ b/source/common/x86/pixel.h
@@ -114,7 +114,7 @@
     FUNCDEF_PU(void,        pixel_sad_x3, cpu, const pel_t*, const pel_t*, const pel_t*, const pel_t*,               intptr_t, int32_t*);\
     FUNCDEF_PU(void,        pixel_sad_x4, cpu, const pel_t*, const pel_t*, const pel_t*, const pel_t*, const pel_t*, intptr_t, int32_t*);\
     FUNCDEF_PU(void,        pixel_avg,    cpu, pel_t* dst, intptr_t dstride, const pel_t* src0, intptr_t sstride0, const pel_t* src1, intptr_t sstride1, int);\
-    FUNCDEF_PU(void,        pixel_add_ps, cpu, pel_t* a,   intptr_t dstride, const pel_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1);\
+    FUNCDEF_PU(void,        pixel_add_ps, cpu, xavs2_t *h, pel_t* a,   intptr_t dstride, const pel_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1);\
     FUNCDEF_PU(void,        pixel_sub_ps, cpu, coeff_t* a, intptr_t dstride, const pel_t* b0, const pel_t*   b1, intptr_t sstride0, intptr_t sstride1);\
     FUNCDEF_PU(int,         pixel_satd,   cpu, const pel_t*, intptr_t, const pel_t*, intptr_t);\
     FUNCDEF_PU(int,         pixel_sad,    cpu, const pel_t*, intptr_t, const pel_t*, intptr_t);\
diff --git a/source/common/x86/pixeladd8.asm b/source/common/x86/pixeladd8.asm
index a0f2fb5..2d39648 100644
--- a/source/common/x86/pixeladd8.asm
+++ b/source/common/x86/pixeladd8.asm
@@ -34,11 +34,11 @@ SECTION .text
 cextern pw_pixel_max
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_4x4(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
-cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_4x4, 6, 6, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mova    m1,     [pw_pixel_max]
     pxor    m0,     m0
     add     r4,     r4
@@ -68,7 +68,7 @@ cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcSt
     RET
 %else
 INIT_XMM sse4
-cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_4x4, 6, 6, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     add         r5,         r5
     pmovzxbw    m0,         [r2]
     pmovzxbw    m2,         [r2 + r4]
@@ -101,12 +101,12 @@ cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcSt
 
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_4x%2(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_ADD_PS_W4_H4 2
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
-cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_4x%2, 6, 7, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mova    m1,     [pw_pixel_max]
     pxor    m0,     m0
     mov     r6d,    %2/4
@@ -143,7 +143,7 @@ cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcS
     RET
 %else
 INIT_XMM sse4
-cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_4x%2, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mov         r6d,        %2/4
     add         r5,         r5
 .loop:
@@ -187,12 +187,12 @@ PIXEL_ADD_PS_W4_H4   4, 16
 
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_8x%2(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_ADD_PS_W8_H4 2
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
-cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_8x%2, 6, 7, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mova    m5,     [pw_pixel_max]
     pxor    m4,     m4
     mov     r6d,    %2/4
@@ -235,7 +235,7 @@ cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcS
     RET
 %else
 INIT_XMM sse4
-cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_8x%2, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mov         r6d,        %2/4
     add         r5,         r5
 .loop:
@@ -280,12 +280,12 @@ PIXEL_ADD_PS_W8_H4 8, 32
 
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_16x%2(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_ADD_PS_W16_H4 2
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
-cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_16x%2, 6, 7, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mova    m5,     [pw_pixel_max]
     pxor    m4,     m4
     mov     r6d,    %2/4
@@ -352,7 +352,7 @@ cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, src
     RET
 %else
 INIT_XMM sse4
-cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_16x%2, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mov         r6d,        %2/4
     add         r5,         r5
 .loop:
@@ -413,13 +413,13 @@ PIXEL_ADD_PS_W16_H4 16, 32
 PIXEL_ADD_PS_W16_H4 16, 64
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_16x16(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_16x16(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_ADD_PS_W16_H4_avx2 1
 %if HIGH_BIT_DEPTH
 %if ARCH_X86_64
 INIT_YMM avx2
-cglobal pixel_add_ps_16x%1, 6, 10, 4, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_16x%1, 6, 10, 4, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mova    m3,     [pw_pixel_max]
     pxor    m2,     m2
     mov     r6d,    %1/4
@@ -464,7 +464,7 @@ cglobal pixel_add_ps_16x%1, 6, 10, 4, dest, destride, src0, scr1, srcStride0, sr
 %endif
 %else
 INIT_YMM avx2
-cglobal pixel_add_ps_16x%1, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_16x%1, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mov         r6d,        %1/4
     add         r5,         r5
 .loop:
@@ -519,12 +519,12 @@ PIXEL_ADD_PS_W16_H4_avx2 64
 
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_32x%2(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_ADD_PS_W32_H2 2
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
-cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_32x%2, 6, 7, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mova    m5,     [pw_pixel_max]
     pxor    m4,     m4
     mov     r6d,    %2/2
@@ -588,7 +588,7 @@ cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, src
     RET
 %else
 INIT_XMM sse4
-cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_32x%2, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mov         r6d,        %2/2
     add         r5,         r5
 .loop:
@@ -644,13 +644,13 @@ PIXEL_ADD_PS_W32_H2 32, 32
 PIXEL_ADD_PS_W32_H2 32, 64
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_32x32(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_ADD_PS_W32_H4_avx2 1
 %if HIGH_BIT_DEPTH
 %if ARCH_X86_64
 INIT_YMM avx2
-cglobal pixel_add_ps_32x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_32x%1, 6, 10, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mova    m5,     [pw_pixel_max]
     pxor    m4,     m4
     mov     r6d,    %1/4
@@ -716,7 +716,7 @@ cglobal pixel_add_ps_32x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, sr
 %else
 %if ARCH_X86_64
 INIT_YMM avx2
-cglobal pixel_add_ps_32x%1, 6, 10, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_32x%1, 6, 10, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mov         r6d,        %1/4
     add         r5,         r5
     lea         r7,         [r4 * 3]
@@ -786,12 +786,12 @@ PIXEL_ADD_PS_W32_H4_avx2 64
 
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_64x%2(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_ADD_PS_W64_H2 2
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
-cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_64x%2, 6, 7, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mova    m5,     [pw_pixel_max]
     pxor    m4,     m4
     mov     r6d,    %2/2
@@ -903,7 +903,7 @@ cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, src
     RET
 %else
 INIT_XMM sse4
-cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_64x%2, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mov         r6d,        %2/2
     add         r5,         r5
 .loop:
@@ -995,13 +995,13 @@ PIXEL_ADD_PS_W64_H2 64, 16
 PIXEL_ADD_PS_W64_H2 64, 64
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_64x64(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_ADD_PS_W64H4_avx2 1
 %if HIGH_BIT_DEPTH
 %if ARCH_X86_64
 INIT_YMM avx2
-cglobal pixel_add_ps_64x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_64x%1, 6, 10, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mova    m5,     [pw_pixel_max]
     pxor    m4,     m4
     mov     r6d,    %1/4
@@ -1110,7 +1110,7 @@ cglobal pixel_add_ps_64x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, sr
 %endif
 %else
 INIT_YMM avx2
-cglobal pixel_add_ps_64x%1, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_64x%1, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1
     mov         r6d,        %1/2
     add         r5,         r5
 .loop:
diff --git a/source/configw.h b/source/configw.h
index 0021edd..cb7d12e 100644
--- a/source/configw.h
+++ b/source/configw.h
@@ -37,7 +37,7 @@
 #ifndef XAVS2_CONFIGW_H
 #define XAVS2_CONFIGW_H
 
-#if defined(__ICL) || defined(_MSC_VER)
+#if defined(__ICL) || defined(_MSC_VER) || defined(__MINGW64_VERSION_MAJOR)
 
 /* arch */
 #define ARCH_X86                1
@@ -57,7 +57,7 @@
 #ifndef __SSE__
 #define __SSE__
 #endif
-#define HAVE_MMX                1     /* X86     */
+#define HAVE_MMX                0     /* X86     */
 #define HAVE_ALTIVEC            0     /* ALTIVEC */
 #define HAVE_ALTIVEC_H          0
 #define HAVE_ARMV6              0
diff --git a/source/encoder/alf.c b/source/encoder/alf.c
index 4e69076..9826e23 100644
--- a/source/encoder/alf.c
+++ b/source/encoder/alf.c
@@ -99,17 +99,17 @@ typedef struct dh_nc {
 } DhNc;
 
 typedef struct {
-    int64_t     m_autoCorr[NO_VAR_BINS][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF];          // auto-correlation matrix
-    double      m_crossCorr[NO_VAR_BINS][ALF_MAX_NUM_COEF];          // cross-correlation
-    double      pixAcc[NO_VAR_BINS];
+    long long int m_autoCorr[NO_VAR_BINS][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF];          // auto-correlation matrix
+    double        m_crossCorr[NO_VAR_BINS][ALF_MAX_NUM_COEF];          // cross-correlation
+    double        pixAcc[NO_VAR_BINS];
 } AlfCorrData;
 
 typedef struct {
     double      m_cross_merged[NO_VAR_BINS][ALF_MAX_NUM_COEF];
-    int64_t     m_auto_merged[NO_VAR_BINS][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF];
+    long long int m_auto_merged[NO_VAR_BINS][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF];
     double      m_cross_temp[ALF_MAX_NUM_COEF];
     double      m_pixAcc_merged[NO_VAR_BINS];
-    int64_t     m_auto_temp[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF];
+    long long int m_auto_temp[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF];
 
     int         m_coeffNoFilter[NO_VAR_BINS][ALF_MAX_NUM_COEF];
     int         m_filterCoeffSym[NO_VAR_BINS][ALF_MAX_NUM_COEF];
@@ -231,9 +231,9 @@ void copyALFparam(ALFParam *dst, ALFParam *src, int componentID)
  * calculate the correlation matrix for Luma
  */
 static
-void calcCorrOneCompRegionLuma(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *org, int i_org, pel_t *rec, int i_rec,
+void calcCorrOneCompRegionLuma8(xavs2_t *h, alf_ctx_t *Enc_ALF, pel8_t *org, int i_org, pel8_t *rec, int i_rec,
                                int yPos, int xPos, int height, int width,
-                               int64_t m_autoCorr[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
+                               long long int m_autoCorr[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
                                double m_crossCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
                                double *pixAcc,
                                int isLeftAvail, int isRightAvail, int isAboveAvail, int isBelowAvail)
@@ -245,15 +245,114 @@ void calcCorrOneCompRegionLuma(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *org, int i
     int endPosLuma   = isBelowAvail ? (yPos + height - 4) : (yPos + height);
     int xOffSetLeft  = isLeftAvail  ? -3 : 0;
     int xOffSetRight = isRightAvail ?  3 : 0;
-    pel_t *imgPad = rec;
-    pel_t *imgOrg = org;
+    pel8_t *imgPad = rec;
+    pel8_t *imgOrg = org;
     int yUp, yBottom;
     int xLeft, xRight;
 
     int ELocal[ALF_MAX_NUM_COEF];
-    pel_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6;
+    pel8_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6;
     int i, j, k, l, yLocal, varInd;
-    int64_t(*E)[9];
+    long long int(*E)[9];
+    double *yy;
+
+    imgPad += startPosLuma * i_rec;
+    imgOrg += startPosLuma * i_org;
+
+    varInd = Enc_ALF->tab_lcu_region[(yPos >> h->i_lcu_level) * h->i_width_in_lcu + (xPos >> h->i_lcu_level)];
+    int step = 1;
+    if (IS_ALG_ENABLE(OPT_FAST_ALF)) {
+        step = 2;
+    }
+    for (i = startPosLuma; i < endPosLuma; i += step) {
+        yUp     = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i - 1);
+        yBottom = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i + 1);
+        imgPad1 = imgPad + (yBottom - i) * i_rec;
+        imgPad2 = imgPad + (yUp - i) * i_rec;
+
+        yUp     = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i - 2);
+        yBottom = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i + 2);
+        imgPad3 = imgPad + (yBottom - i) * i_rec;
+        imgPad4 = imgPad + (yUp - i) * i_rec;
+
+        yUp     = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i - 3);
+        yBottom = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i + 3);
+        imgPad5 = imgPad + (yBottom - i) * i_rec;
+        imgPad6 = imgPad + (yUp - i) * i_rec;
+
+        for (j = xPos; j < xPosEnd; j += step) {
+            memset(ELocal, 0, N * sizeof(int));
+
+            ELocal[0] = (imgPad5[j] + imgPad6[j]);
+            ELocal[1] = (imgPad3[j] + imgPad4[j]);
+
+            xLeft  = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 1);
+            xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 1);
+            ELocal[2] = (imgPad1[xRight] + imgPad2[xLeft]);
+            ELocal[3] = (imgPad1[j  ] + imgPad2[j  ]);
+            ELocal[4] = (imgPad1[xLeft] + imgPad2[xRight]);
+            ELocal[7] = (imgPad[xRight] + imgPad[xLeft]);
+
+            xLeft  = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 2);
+            xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 2);
+            ELocal[6] = (imgPad[xRight] + imgPad[xLeft]);
+
+            xLeft  = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 3);
+            xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 3);
+            ELocal[5] = (imgPad[xRight] + imgPad[xLeft]);
+            ELocal[8] = (imgPad[j  ]);
+
+            yLocal = imgOrg[j];
+            pixAcc[varInd] += (yLocal * yLocal);
+            E  = m_autoCorr[varInd];
+            yy = m_crossCorr[varInd];
+
+            for (k = 0; k < N; k++) {
+                for (l = k; l < N; l++) {
+                    E[k][l] += (ELocal[k] * ELocal[l]);
+                }
+                yy[k] += (double)(ELocal[k] * yLocal);
+            }
+        }
+
+        imgPad += i_rec;
+        imgOrg += i_org;
+    }
+
+    for (varInd = 0; varInd < NO_VAR_BINS; varInd++) {
+        E = m_autoCorr[varInd];
+        for (k = 1; k < N; k++) {
+            for (l = 0; l < k; l++) {
+                E[k][l] = E[l][k];
+            }
+        }
+    }
+}
+
+static
+void calcCorrOneCompRegionLuma10(xavs2_t *h, alf_ctx_t *Enc_ALF, pel10_t *org, int i_org, pel10_t *rec, int i_rec,
+                               int yPos, int xPos, int height, int width,
+                               long long int m_autoCorr[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
+                               double m_crossCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
+                               double *pixAcc,
+                               int isLeftAvail, int isRightAvail, int isAboveAvail, int isBelowAvail)
+{
+    int xPosEnd = xPos + width;
+    int N = ALF_MAX_NUM_COEF; //m_sqrFiltLengthTab[0];
+
+    int startPosLuma = isAboveAvail ? (yPos - 4) : yPos;
+    int endPosLuma   = isBelowAvail ? (yPos + height - 4) : (yPos + height);
+    int xOffSetLeft  = isLeftAvail  ? -3 : 0;
+    int xOffSetRight = isRightAvail ?  3 : 0;
+    pel10_t *imgPad = rec;
+    pel10_t *imgOrg = org;
+    int yUp, yBottom;
+    int xLeft, xRight;
+
+    int ELocal[ALF_MAX_NUM_COEF];
+    pel10_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6;
+    int i, j, k, l, yLocal, varInd;
+    long long int(*E)[9];
     double *yy;
 
     imgPad += startPosLuma * i_rec;
@@ -333,8 +432,8 @@ void calcCorrOneCompRegionLuma(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *org, int i
  * calculate the correlation matrix for Chroma
  */
 static
-void calcCorrOneCompRegionChma(xavs2_t *h, pel_t *org, int i_org, pel_t *rec, int i_rec, int yPos, int xPos, int height, int width,
-                               int64_t m_autoCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *m_crossCorr,
+void calcCorrOneCompRegionChma8(xavs2_t *h, pel8_t *org, int i_org, pel8_t *rec, int i_rec, int yPos, int xPos, int height, int width,
+                               long long int m_autoCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *m_crossCorr,
                                int isLeftAvail, int isRightAvail, int isAboveAvail, int isBelowAvail)
 {
     int xPosEnd = xPos + width;
@@ -344,13 +443,102 @@ void calcCorrOneCompRegionChma(xavs2_t *h, pel_t *org, int i_org, pel_t *rec, in
     int endPosChroma   = isBelowAvail ? (yPos + height - 4) : (yPos + height);
     int xOffSetLeft    = isLeftAvail  ? -3 : 0;
     int xOffSetRight   = isRightAvail ?  3 : 0;
-    pel_t *imgPad = rec;
-    pel_t *imgOrg = org;
+    pel8_t *imgPad = rec;
+    pel8_t *imgOrg = org;
     int yUp, yBottom;
     int xLeft, xRight;
 
     int ELocal[ALF_MAX_NUM_COEF];
-    pel_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6;
+    pel8_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6;
+    int i, j, k, l, yLocal;
+
+    imgPad += startPosChroma * i_rec;
+    imgOrg += startPosChroma * i_org;
+
+    int step = 1;
+    if (IS_ALG_ENABLE(OPT_FAST_ALF)) {
+        step = 2;
+    }
+    for (i = startPosChroma; i < endPosChroma; i += step) {
+        yUp     = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i - 1);
+        yBottom = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i + 1);
+        imgPad1 = imgPad + (yBottom - i) * i_rec;
+        imgPad2 = imgPad + (yUp - i) * i_rec;
+
+        yUp     = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i - 2);
+        yBottom = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i + 2);
+        imgPad3 = imgPad + (yBottom - i) * i_rec;
+        imgPad4 = imgPad + (yUp - i) * i_rec;
+
+        yUp     = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i - 3);
+        yBottom = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i + 3);
+        imgPad5 = imgPad + (yBottom - i) * i_rec;
+        imgPad6 = imgPad + (yUp - i) * i_rec;
+
+        for (j = xPos; j < xPosEnd; j += step) {
+            memset(ELocal, 0, N * sizeof(int));
+
+            ELocal[0] = (imgPad5[j] + imgPad6[j]);
+            ELocal[1] = (imgPad3[j] + imgPad4[j]);
+
+            xLeft  = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 1);
+            xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 1);
+            ELocal[2] = (imgPad1[xRight] + imgPad2[xLeft]);
+            ELocal[3] = (imgPad1[j  ] + imgPad2[j  ]);
+            ELocal[4] = (imgPad1[xLeft] + imgPad2[xRight]);
+            ELocal[7] = (imgPad[xRight] + imgPad[xLeft]);
+
+            xLeft  = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 2);
+            xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 2);
+            ELocal[6] = (imgPad[xRight] + imgPad[xLeft]);
+
+            xLeft  = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 3);
+            xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 3);
+            ELocal[5] = (imgPad[xRight] + imgPad[xLeft]);
+            ELocal[8] = (imgPad[j  ]);
+
+            yLocal = (int)imgOrg[j];
+
+            for (k = 0; k < N; k++) {
+                m_autoCorr[k][k] += ELocal[k] * ELocal[k];
+                for (l = k + 1; l < N; l++) {
+                    m_autoCorr[k][l] += ELocal[k] * ELocal[l];
+                }
+
+                m_crossCorr[k] += yLocal * ELocal[k];
+            }
+        }
+
+        imgPad += i_rec;
+        imgOrg += i_org;
+    }
+
+    for (j = 0; j < N - 1; j++) {
+        for (i = j + 1; i < N; i++) {
+            m_autoCorr[i][j] = m_autoCorr[j][i];
+        }
+    }
+}
+
+static
+void calcCorrOneCompRegionChma10(xavs2_t *h, pel10_t *org, int i_org, pel10_t *rec, int i_rec, int yPos, int xPos, int height, int width,
+                               long long int m_autoCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *m_crossCorr,
+                               int isLeftAvail, int isRightAvail, int isAboveAvail, int isBelowAvail)
+{
+    int xPosEnd = xPos + width;
+    const int N = ALF_MAX_NUM_COEF; //m_sqrFiltLengthTab[0];
+
+    int startPosChroma = isAboveAvail ? (yPos - 4) : yPos;
+    int endPosChroma   = isBelowAvail ? (yPos + height - 4) : (yPos + height);
+    int xOffSetLeft    = isLeftAvail  ? -3 : 0;
+    int xOffSetRight   = isRightAvail ?  3 : 0;
+    pel10_t *imgPad = rec;
+    pel10_t *imgOrg = org;
+    int yUp, yBottom;
+    int xLeft, xRight;
+
+    int ELocal[ALF_MAX_NUM_COEF];
+    pel10_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6;
     int i, j, k, l, yLocal;
 
     imgPad += startPosChroma * i_rec;
@@ -451,7 +639,7 @@ void deriveBoundaryAvail(xavs2_t *h, int pic_x, int pic_y,
     int size_lcu = 1 << h->i_lcu_level;
     int mb_x, mb_y;
     //int pic_mb_width = h->i_width_in_mincu;
-    //cu_info_t *cuCurr, *cuLeft, *cuRight, *cuAbove, *cuBelow; 
+    //cu_info_t *cuCurr, *cuLeft, *cuRight, *cuAbove, *cuBelow;
 
     mb_x      = pic_x >> MIN_CU_SIZE_IN_BIT;
     mb_y      = pic_y >> MIN_CU_SIZE_IN_BIT;
@@ -514,8 +702,9 @@ void alf_get_statistics_lcu(xavs2_t *h, int lcu_x, int lcu_y,
 
     reset_alfCorr(alfCorr, compIdx);
     formatShift = 1;
-    calcCorrOneCompRegionChma(h, p_org->planes[compIdx], p_org->i_stride[compIdx],
-                              p_rec->planes[compIdx], p_rec->i_stride[compIdx],
+    if (h->param->input_sample_bit_depth == 8) {
+    calcCorrOneCompRegionChma8(h, p_org->planes8[compIdx], p_org->i_stride[compIdx],
+                              p_rec->planes8[compIdx], p_rec->i_stride[compIdx],
                               ctuYPos >> formatShift, ctuXPos >> formatShift,
                               ctuHeight >> formatShift, ctuWidth >> formatShift,
                               alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0],
@@ -524,9 +713,9 @@ void alf_get_statistics_lcu(xavs2_t *h, int lcu_x, int lcu_y,
     compIdx = IMG_V;
     alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu];
     reset_alfCorr(alfCorr, compIdx);
-    //V·ÖÁ¿µÄypos, xpos, height, widthËÄ¸öÖµÓëU·ÖÁ¿Ò»Ñù£¬²»ÐèÒªÐÞ¸Ä
-    calcCorrOneCompRegionChma(h, p_org->planes[compIdx], p_org->i_stride[compIdx],
-                              p_rec->planes[compIdx], p_rec->i_stride[compIdx],
+    // The V component uses the same ypos, xpos, height and width values as the U component, so they are not modified here
+    calcCorrOneCompRegionChma8(h, p_org->planes8[compIdx], p_org->i_stride[compIdx],
+                              p_rec->planes8[compIdx], p_rec->i_stride[compIdx],
                               ctuYPos >> formatShift, ctuXPos >> formatShift,
                               ctuHeight >> formatShift, ctuWidth >> formatShift,
                               alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0],
@@ -536,12 +725,42 @@ void alf_get_statistics_lcu(xavs2_t *h, int lcu_x, int lcu_y,
     alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu];
     reset_alfCorr(alfCorr, compIdx);
     formatShift = 0;
-    calcCorrOneCompRegionLuma(h, Enc_ALF, p_org->planes[compIdx], p_org->i_stride[compIdx],
-                              p_rec->planes[compIdx], p_rec->i_stride[compIdx],
+    calcCorrOneCompRegionLuma8(h, Enc_ALF, p_org->planes8[compIdx], p_org->i_stride[compIdx],
+                              p_rec->planes8[compIdx], p_rec->i_stride[compIdx],
                               ctuYPos >> formatShift, ctuXPos >> formatShift,
                               ctuHeight >> formatShift, ctuWidth >> formatShift,
                               alfCorr->m_autoCorr, alfCorr->m_crossCorr, alfCorr->pixAcc,
                               isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail);
+    } else {
+    calcCorrOneCompRegionChma10(h, p_org->planes10[compIdx], p_org->i_stride[compIdx],
+                              p_rec->planes10[compIdx], p_rec->i_stride[compIdx],
+                              ctuYPos >> formatShift, ctuXPos >> formatShift,
+                              ctuHeight >> formatShift, ctuWidth >> formatShift,
+                              alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0],
+                              isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail);
+
+    compIdx = IMG_V;
+    alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu];
+    reset_alfCorr(alfCorr, compIdx);
+    // The V component uses the same ypos, xpos, height and width values as the U component, so they are not modified here
+    calcCorrOneCompRegionChma10(h, p_org->planes10[compIdx], p_org->i_stride[compIdx],
+                              p_rec->planes10[compIdx], p_rec->i_stride[compIdx],
+                              ctuYPos >> formatShift, ctuXPos >> formatShift,
+                              ctuHeight >> formatShift, ctuWidth >> formatShift,
+                              alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0],
+                              isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail);
+
+    compIdx = IMG_Y;
+    alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu];
+    reset_alfCorr(alfCorr, compIdx);
+    formatShift = 0;
+    calcCorrOneCompRegionLuma10(h, Enc_ALF, p_org->planes10[compIdx], p_org->i_stride[compIdx],
+                              p_rec->planes10[compIdx], p_rec->i_stride[compIdx],
+                              ctuYPos >> formatShift, ctuXPos >> formatShift,
+                              ctuHeight >> formatShift, ctuWidth >> formatShift,
+                              alfCorr->m_autoCorr, alfCorr->m_crossCorr, alfCorr->pixAcc,
+                              isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail);
+    }
 }
 
 
@@ -560,7 +779,7 @@ static
 void mergeFrom(AlfCorrData *dst, AlfCorrData *src, int *mergeTable, int doPixAccMerge, int componentID)
 {
     int numCoef = ALF_MAX_NUM_COEF;
-    int64_t (*srcE)[ALF_MAX_NUM_COEF], (*dstE)[ALF_MAX_NUM_COEF];
+    long long int (*srcE)[ALF_MAX_NUM_COEF], (*dstE)[ALF_MAX_NUM_COEF];
     double *srcy, *dsty;
     int maxFilterSetSize, j, i, varInd, filtIdx;
 
@@ -651,7 +870,7 @@ static uint32_t estimateALFBitrateInPicHeader(ALFParam *alfPicParam)
  */
 static
 long xFastFiltDistEstimation(alf_ctx_t *Enc_ALF,
-                             int64_t ppdE[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
+                             long long int ppdE[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
                              double *pdy, int *piCoeff, int iFiltLength)
 {
     //static memory
@@ -711,13 +930,73 @@ long estimateFilterDistortion(alf_ctx_t *Enc_ALF, int compIdx, AlfCorrData *alfC
 /* ---------------------------------------------------------------------------
  */
 static
-dist_t calcAlfLCUDist(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx,
+dist_t calcAlfLCUDist8(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx,
+                      int ypos, int xpos, int height, int width, int isAboveAvail,
+                      pel8_t *picSrc, int i_src, pel8_t *picCmp, int i_cmp)
+{
+    /* Measures the difference between the picSrc and picCmp regions of one LCU with
+     * g_funcs.pixf.ssd8[] (ssd_block8 when PART_INDEX(width, height) is LUMA_INVALID).
+     * IMG_U / IMG_V regions start 4 rows above ypos when isAboveAvail is set; any
+     * other component starts at (xpos, ypos). The parameter h is not referenced here. */
+    dist_t dist = 0;
+    pel8_t *pelCmp = picCmp;
+    pel8_t *pelSrc = picSrc;
+
+    /* both flags are fixed to TRUE, so the size adjustments they guard never execute */
+    int notSkipLinesRightVB = TRUE;
+    int notSkipLinesBelowVB = TRUE;
+
+    switch (compIdx) {
+    case IMG_U:
+    case IMG_V:
+        if (!notSkipLinesBelowVB) {
+            height = height - (int)(DF_CHANGED_SIZE >> 1) - (int)(ALF_FOOTPRINT_SIZE >> 1);
+        }
+
+        if (!notSkipLinesRightVB) {
+            width = width - (int)(DF_CHANGED_SIZE >> 1) - (int)(ALF_FOOTPRINT_SIZE >> 1);
+        }
+
+        if (isAboveAvail) {
+            pelSrc += ((ypos - 4) * i_src) + xpos;  /* start 4 rows above ypos */
+            pelCmp += ((ypos - 4) * i_cmp) + xpos;
+        } else {
+            pelSrc += (ypos * i_src) + xpos;
+            pelCmp += (ypos * i_cmp) + xpos;
+        }
+        break;
+    default:
+        // case IMG_Y: taken for every component other than IMG_U / IMG_V
+        if (!notSkipLinesBelowVB) {
+            height = height - (int)(DF_CHANGED_SIZE)-(int)(ALF_FOOTPRINT_SIZE >> 1);
+        }
+
+        if (!notSkipLinesRightVB) {
+            width = width - (int)(DF_CHANGED_SIZE)-(int)(ALF_FOOTPRINT_SIZE >> 1);
+        }
+
+        pelCmp = picCmp + (ypos * i_cmp) + xpos;
+        pelSrc = picSrc + (ypos * i_src) + xpos;
+        break;
+    }
+    if (PART_INDEX(width, height) == LUMA_INVALID) {  /* no table entry for this block size */
+        uint32_t uiShift = Enc_ALF->m_uiBitIncrement << 1;  /* 2 * m_uiBitIncrement; applied on this path only */
+        dist += g_funcs.pixf.ssd_block8(pelSrc, i_src, pelCmp, i_cmp, width, height) >> uiShift;
+    } else {
+        dist += g_funcs.pixf.ssd8[PART_INDEX(width, height)](pelSrc, i_src, pelCmp, i_cmp);
+    }
+
+    return dist;
+}
+
+static
+dist_t calcAlfLCUDist10(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx,
                       int ypos, int xpos, int height, int width, int isAboveAvail,
-                      pel_t *picSrc, int i_src, pel_t *picCmp, int i_cmp)
+                      pel10_t *picSrc, int i_src, pel10_t *picCmp, int i_cmp)
 {
     dist_t dist = 0;
-    pel_t *pelCmp = picCmp;
-    pel_t *pelSrc = picSrc;
+    pel10_t *pelCmp = picCmp;
+    pel10_t *pelSrc = picSrc;
 
     int notSkipLinesRightVB = TRUE;
     int notSkipLinesBelowVB = TRUE;
@@ -762,9 +1041,9 @@ dist_t calcAlfLCUDist(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx,
     }
     if (PART_INDEX(width, height) == LUMA_INVALID) {
         uint32_t uiShift = Enc_ALF->m_uiBitIncrement << 1;
-        dist += g_funcs.pixf.ssd_block(pelSrc, i_src, pelCmp, i_cmp, width, height) >> uiShift;
+        dist += g_funcs.pixf.ssd_block10(pelSrc, i_src, pelCmp, i_cmp, width, height) >> uiShift;
     } else {
-        dist += g_funcs.pixf.ssd[PART_INDEX(width, height)](pelSrc, i_src, pelCmp, i_cmp);
+        dist += g_funcs.pixf.ssd10[PART_INDEX(width, height)](pelSrc, i_src, pelCmp, i_cmp);
     }
 
     return dist;
@@ -774,7 +1053,7 @@ dist_t calcAlfLCUDist(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx,
  * ALF filter on CTB
  */
 static
-void filterOneCTB(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,
+void filterOneCTB8(xavs2_t *h, alf_ctx_t *Enc_ALF, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src,
                   int compIdx, ALFParam *alfParam, int ypos, int height, int xpos, int width,
                   int isAboveAvail, int isBelowAvail)
 {
@@ -792,11 +1071,37 @@ void filterOneCTB(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *p_dst, int i_dst, pel_t
         coef = Enc_ALF->m_filterCoeffSym[0];
     }
 
+    g_funcs.alf_flt8[0](h, p_dst, i_dst, p_src, i_src,
+                       xpos, ypos, width, height, coef,
+                       isAboveAvail, isBelowAvail);
+    g_funcs.alf_flt8[1](h, p_dst, i_dst, p_src, i_src,
+                       xpos, ypos, width, height, coef,
+                       isAboveAvail, isBelowAvail);
+}
+
+static
+void filterOneCTB10(xavs2_t *h, alf_ctx_t *Enc_ALF, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src,
+                  int compIdx, ALFParam *alfParam, int ypos, int height, int xpos, int width,
+                  int isAboveAvail, int isBelowAvail)
+{
+    /* runs g_funcs.alf_flt10[0] and then alf_flt10[1] on one CTB (presumably filtering p_src into p_dst) */
+    int *coef;
+    //rebuild Enc_ALF->m_filterCoeffSym and Enc_ALF->m_varIndTab from alfParam
+    reconstructCoefInfo(compIdx, alfParam, Enc_ALF->m_filterCoeffSym, Enc_ALF->m_varIndTab); //reconstruct ALF coefficients & related parameters
+
+    //pick the coefficient set: IMG_Y maps the LCU containing (xpos, ypos) through
+    //tab_lcu_region and m_varIndTab; every other component uses set 0
+    if (compIdx == IMG_Y) {
+        int var = Enc_ALF->tab_lcu_region[(ypos >> h->i_lcu_level) * h->i_width_in_lcu + (xpos >> h->i_lcu_level)];
+        coef = Enc_ALF->m_filterCoeffSym[Enc_ALF->m_varIndTab[var]];
+    } else {
+        coef = Enc_ALF->m_filterCoeffSym[0];
+    }
 
-    g_funcs.alf_flt[0](p_dst, i_dst, p_src, i_src,
+    g_funcs.alf_flt10[0](h, p_dst, i_dst, p_src, i_src,
                        xpos, ypos, width, height, coef,
                        isAboveAvail, isBelowAvail);
-    g_funcs.alf_flt[1](p_dst, i_dst, p_src, i_src,
+    g_funcs.alf_flt10[1](h, p_dst, i_dst, p_src, i_src,
                        xpos, ypos, width, height, coef,
                        isAboveAvail, isBelowAvail);
 }
@@ -804,7 +1109,7 @@ void filterOneCTB(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *p_dst, int i_dst, pel_t
 /* ---------------------------------------------------------------------------
  */
 static ALWAYS_INLINE
-void copyOneAlfBlk(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int ypos, int xpos,
+void copyOneAlfBlk8(xavs2_t *h, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src, int ypos, int xpos,
                    int height, int width, int isAboveAvail, int isBelowAvail)
 {
     int startPos  = isAboveAvail ? (ypos          - 4) : ypos;
@@ -812,7 +1117,19 @@ void copyOneAlfBlk(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int ypos, i
     p_dst += (startPos * i_dst) + xpos;
     p_src += (startPos * i_src) + xpos;
 
-    g_funcs.plane_copy(p_dst, i_dst, p_src, i_src, width, endPos - startPos);
+    g_funcs.plane_copy8(h, p_dst, i_dst, p_src, i_src, width, endPos - startPos);
+}
+
+static ALWAYS_INLINE
+void copyOneAlfBlk10(xavs2_t *h, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src, int ypos, int xpos,
+                   int height, int width, int isAboveAvail, int isBelowAvail)
+{
+    int startPos  = isAboveAvail ? (ypos          - 4) : ypos;  /* first row: 4 rows above ypos when isAboveAvail is set */
+    int endPos    = isBelowAvail ? (ypos + height - 4) : ypos + height;  /* end row (exclusive): 4 rows short when isBelowAvail is set */
+    p_dst += (startPos * i_dst) + xpos;
+    p_src += (startPos * i_src) + xpos;
+    /* hand the window of `width` columns and (endPos - startPos) rows to plane_copy10 (presumably copies p_src to p_dst) */
+    g_funcs.plane_copy10(h, p_dst, i_dst, p_src, i_src, width, endPos - startPos);
 }
 
 /* ---------------------------------------------------------------------------
@@ -941,9 +1258,10 @@ void executePicLCUOnOffDecision(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, AL
     int i_org = 0;
     int i_rec_before = 0;
     int i_rec_after = 0;
-    pel_t *p_org_pixel = NULL;
-    pel_t *p_rec_before = NULL;
-    pel_t *p_rec_after = NULL;
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_org_pixel = NULL;
+    pel8_t *p_rec_before = NULL;
+    pel8_t *p_rec_after = NULL;
     double lambda_luma, lambda_chroma;
     int img_height, img_width;
     int size_lcu = 1 << h->i_lcu_level;
@@ -988,20 +1306,20 @@ void executePicLCUOnOffDecision(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, AL
                 }
 
                 formatShift = (compIdx == IMG_Y) ? 0 : 1;
-                p_org_pixel = p_org->planes[compIdx];
+                p_org_pixel = p_org->planes8[compIdx];
                 i_org = p_org->i_stride[compIdx];
-                p_rec_before = p_rec->planes[compIdx];
+                p_rec_before = p_rec->planes8[compIdx];
                 i_rec_before = p_rec->i_stride[compIdx];
-                p_rec_after = p_dst->planes[compIdx];
+                p_rec_after = p_dst->planes8[compIdx];
                 i_rec_after = p_dst->i_stride[compIdx];
 
                 // ALF on
-                filterOneCTB(h, Enc_ALF, p_rec_after, i_rec_after, p_rec_before, i_rec_before, compIdx,
+                filterOneCTB8(h, Enc_ALF, p_rec_after, i_rec_after, p_rec_before, i_rec_before, compIdx,
                              &alfPictureParam[compIdx], ctuYPos >> formatShift, ctuHeight >> formatShift,
                              ctuXPos >> formatShift, ctuWidth >> formatShift, isAboveAvail, isBelowAvail);
-                distEnc = calcAlfLCUDist(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift,
+                distEnc = calcAlfLCUDist8(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift,
                                          ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, p_org_pixel, i_org, p_rec_after, i_rec_after);
-                distEnc -= calcAlfLCUDist(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift,
+                distEnc -= calcAlfLCUDist8(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift,
                                           ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, p_org_pixel, i_org, p_rec_before, i_rec_before);
 
                 h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr);
@@ -1022,7 +1340,7 @@ void executePicLCUOnOffDecision(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, AL
                 h->is_alf_lcu_on[ctu][compIdx] = (costEnc < costOff) ? TRUE : FALSE;
 
                 if (!h->is_alf_lcu_on[ctu][compIdx]) {
-                    copyOneAlfBlk(p_rec_after, i_rec_after, p_rec_before, i_rec_before,
+                    copyOneAlfBlk8(h, p_rec_after, i_rec_after, p_rec_before, i_rec_before,
                                   ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift,
                                   isAboveAvail, isBelowAvail);
                 }
@@ -1060,12 +1378,139 @@ void executePicLCUOnOffDecision(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, AL
                     h->is_alf_lcu_on[ctu][compIdx] = FALSE;
                 }
 
-                g_funcs.plane_copy(p_dst->planes[compIdx], p_dst->i_stride[compIdx],
-                                   p_rec->planes[compIdx], p_rec->i_stride[compIdx],
+                g_funcs.plane_copy8(h, p_dst->planes8[compIdx], p_dst->i_stride[compIdx],
+                                   p_rec->planes8[compIdx], p_rec->i_stride[compIdx],
                                    p_rec->i_width[compIdx], p_rec->i_lines[compIdx]);
             }
         }
     }
+    } else {
+    pel10_t *p_org_pixel = NULL;
+    pel10_t *p_rec_before = NULL;
+    pel10_t *p_rec_after = NULL;
+    double lambda_luma, lambda_chroma;
+    int img_height, img_width;
+    int size_lcu = 1 << h->i_lcu_level;
+    int ctux, ctuy;
+    int NumCUsInFrame, numLCUInPicWidth, numLCUInPicHeight;
+    int rate, noFilters;
+
+    h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_initial);
+    h->copy_aec_state_rdo(&h->cs_data.cs_alf_cu_ctr, p_aec);
+
+    img_height = h->i_height;
+    img_width = h->i_width;
+    numLCUInPicHeight = h->i_height_in_lcu;
+    numLCUInPicWidth = h->i_width_in_lcu;
+    NumCUsInFrame = numLCUInPicHeight * numLCUInPicWidth;
+
+    lambda_luma = lambda; //VKTBD lambda is not correct
+    lambda_chroma = LAMBDA_SCALE_CHROMA * lambda_luma;
+    for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) {
+        distBestPic[compIdx] = 0;
+        rateBestPic[compIdx] = 0;
+    }
+
+    for (ctuy = 0, ctu = 0; ctuy < numLCUInPicHeight; ctuy++) {
+        //derive CTU height
+        ctuYPos = ctuy * size_lcu;
+        ctuHeight = XAVS2_MIN(img_height - ctuYPos, size_lcu);
+        for (ctux = 0; ctux < numLCUInPicWidth; ctux++, ctu++) {
+            //derive CTU width
+            ctuXPos = ctux * size_lcu;
+            ctuWidth = XAVS2_MIN(img_width - ctuXPos, size_lcu);
+
+            //derive CTU boundary availabilities
+            deriveBoundaryAvail(h, ctuXPos, ctuYPos,
+                                &isLeftAvail, &isRightAvail, &isAboveAvail, &isBelowAvail);
+
+            for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) {
+                //if slice-level enabled flag is 0, set CTB-level enabled flag 0
+                if (alfPictureParam[compIdx].alf_flag == 0) {
+                    h->is_alf_lcu_on[ctu][compIdx] = FALSE;
+                    continue;
+                }
+
+                formatShift = (compIdx == IMG_Y) ? 0 : 1;
+                p_org_pixel = p_org->planes10[compIdx];
+                i_org = p_org->i_stride[compIdx];
+                p_rec_before = p_rec->planes10[compIdx];
+                i_rec_before = p_rec->i_stride[compIdx];
+                p_rec_after = p_dst->planes10[compIdx];
+                i_rec_after = p_dst->i_stride[compIdx];
+
+                // ALF on
+                filterOneCTB10(h, Enc_ALF, p_rec_after, i_rec_after, p_rec_before, i_rec_before, compIdx,
+                             &alfPictureParam[compIdx], ctuYPos >> formatShift, ctuHeight >> formatShift,
+                             ctuXPos >> formatShift, ctuWidth >> formatShift, isAboveAvail, isBelowAvail);
+                distEnc = calcAlfLCUDist10(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift,
+                                         ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, p_org_pixel, i_org, p_rec_after, i_rec_after);
+                distEnc -= calcAlfLCUDist10(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift,
+                                          ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, p_org_pixel, i_org, p_rec_before, i_rec_before);
+
+                h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr);
+
+                rateEnc = p_aec->binary.write_alf_lcu_ctrl(p_aec, 1);
+
+                costEnc = (double)distEnc + (compIdx == 0 ? lambda_luma : lambda_chroma) * rateEnc;
+
+                // ALF off
+                distOff = 0;
+                //rateOff = 1;
+                h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr);
+                rateOff = p_aec->binary.write_alf_lcu_ctrl(p_aec, 0);
+
+                costOff = (double)distOff + (compIdx == 0 ? lambda_luma : lambda_chroma) * rateOff;
+
+                //set CTB-level on/off flag
+                h->is_alf_lcu_on[ctu][compIdx] = (costEnc < costOff) ? TRUE : FALSE;
+
+                if (!h->is_alf_lcu_on[ctu][compIdx]) {
+                    copyOneAlfBlk10(h, p_rec_after, i_rec_after, p_rec_before, i_rec_before,
+                                  ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift,
+                                  isAboveAvail, isBelowAvail);
+                }
+
+                //update CABAC status
+                //cabacCoder->updateAlfCtrlFlagState(m_pcPic->getCU(ctu)->getAlfLCUEnabled(compIdx)?1:0);
+
+                h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr);
+                rateOff = p_aec->binary.write_alf_lcu_ctrl(p_aec, (h->is_alf_lcu_on[ctu][compIdx] ? 1 : 0));
+                h->copy_aec_state_rdo(&h->cs_data.cs_alf_cu_ctr, p_aec);
+
+                rateBestPic[compIdx] += (h->is_alf_lcu_on[ctu][compIdx] ? rateEnc : rateOff);
+                distBestPic[compIdx] += (h->is_alf_lcu_on[ctu][compIdx] ? distEnc : distOff);
+
+            } //CTB
+        }
+    } //CTU
+
+    for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) {
+        if (alfPictureParam[compIdx].alf_flag == 1) {
+            double Lambda = (compIdx == 0 ? lambda_luma : lambda_chroma);
+            rate = ALFParamBitrateEstimate(&alfPictureParam[compIdx]);
+            if (compIdx == IMG_Y) {
+                noFilters = alfPictureParam[0].filters_per_group - 1;
+                rate += uvlc_bitrate_estimate[noFilters] + (4 * noFilters);
+            }
+            costAlfOn = (double)distBestPic[compIdx] + Lambda *
+                        (rateBestPic[compIdx] + (double)(rate));
+
+            costAlfOff = 0;
+
+            if (costAlfOn >= costAlfOff) {
+                alfPictureParam[compIdx].alf_flag = 0;
+                for (ctu = 0; ctu < NumCUsInFrame; ctu++) {
+                    h->is_alf_lcu_on[ctu][compIdx] = FALSE;
+                }
+
+                g_funcs.plane_copy10(h, p_dst->planes10[compIdx], p_dst->i_stride[compIdx],
+                                   p_rec->planes10[compIdx], p_rec->i_stride[compIdx],
+                                   p_rec->i_width[compIdx], p_rec->i_lines[compIdx]);
+            }
+        }
+    }
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -1209,7 +1654,7 @@ static void gnsBacksubstitution(double R[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], do
 
 /* ---------------------------------------------------------------------------
  */
-static int gnsCholeskyDec(int64_t inpMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double outMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int noEq)
+static int gnsCholeskyDec(long long int inpMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double outMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int noEq)
 {
     int i, j, k;        /* Looping Variables */
     double scale;       /* scaling factor for each row */
@@ -1245,7 +1690,7 @@ static int gnsCholeskyDec(int64_t inpMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], d
 
 /* ---------------------------------------------------------------------------
  */
-static int gnsSolveByChol(int64_t LHS[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *rhs, double *x, int noEq)
+static int gnsSolveByChol(long long int LHS[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *rhs, double *x, int noEq)
 {
     double aux[ALF_MAX_NUM_COEF];     /* Auxiliary vector */
     double U[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF];    /* Upper triangular Cholesky factor of LHS */
@@ -1291,7 +1736,7 @@ static int gnsSolveByChol(int64_t LHS[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], doubl
 
 /* ---------------------------------------------------------------------------
  */
-static double calculateErrorAbs(int64_t A[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *b, double y, int size)
+static double calculateErrorAbs(long long int A[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *b, double y, int size)
 {
     int i;
     double error, sum;
@@ -1311,7 +1756,7 @@ static double calculateErrorAbs(int64_t A[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], d
 /* ---------------------------------------------------------------------------
  */
 static
-double mergeFiltersGreedy(alf_ctx_t *Enc_ALF, double yGlobalSeq[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int64_t EGlobalSeq[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
+double mergeFiltersGreedy(alf_ctx_t *Enc_ALF, double yGlobalSeq[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], long long int EGlobalSeq[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
                           double *pixAccGlobalSeq, int intervalBest[NO_VAR_BINS][2], int sqrFiltLength, int noIntervals)
 {
     int first, ind, ind1, ind2, i, j, bestToMerge;
@@ -1465,7 +1910,7 @@ static double xfindBestCoeffCodMethod(int filterCoeffSymQuant[][ALF_MAX_NUM_COEF
 
 /* ---------------------------------------------------------------------------
  */
-static void add_A(int64_t Amerged[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int64_t A[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int start, int stop, int size)
+static void add_A(int64_t Amerged[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], long long int A[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int start, int stop, int size)
 {
     int i, j, ind;
 
@@ -1527,7 +1972,7 @@ static double calculateErrorCoeffProvided(int64_t A[ALF_MAX_NUM_COEF][ALF_MAX_NU
 
 /* ---------------------------------------------------------------------------
  */
-static double QuantizeIntegerFilterPP(double *filterCoeff, int *filterCoeffQuant, int64_t E[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *y, int sqrFiltLength)
+static double QuantizeIntegerFilterPP(double *filterCoeff, int *filterCoeffQuant, long long int E[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *y, int sqrFiltLength)
 {
     double error;
     int filterCoeffQuantMod[ALF_MAX_NUM_COEF];
@@ -1598,7 +2043,7 @@ static double QuantizeIntegerFilterPP(double *filterCoeff, int *filterCoeffQuant
 
 /* ---------------------------------------------------------------------------
  */
-static double findFilterCoeff(alf_ctx_t *Enc_ALF, int64_t EGlobalSeq[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double yGlobalSeq[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
+static double findFilterCoeff(alf_ctx_t *Enc_ALF, long long int EGlobalSeq[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double yGlobalSeq[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
                               double *pixAccGlobalSeq, int filterCoeffSeq[][ALF_MAX_NUM_COEF], int filterCoeffQuantSeq[][ALF_MAX_NUM_COEF], int intervalBest[NO_VAR_BINS][2],
                               int varIndTab[NO_VAR_BINS], int sqrFiltLength, int filters_per_fr, double errorTabForce0Coeff[NO_VAR_BINS][2])
 {
@@ -1635,7 +2080,7 @@ static double findFilterCoeff(alf_ctx_t *Enc_ALF, int64_t EGlobalSeq[][ALF_MAX_N
 /* ---------------------------------------------------------------------------
  */
 static
-void xfindBestFilterVarPred(alf_ctx_t *Enc_ALF, double ySym[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int64_t ESym[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
+void xfindBestFilterVarPred(alf_ctx_t *Enc_ALF, double ySym[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], long long int ESym[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF],
                             double *pixAcc, int filterCoeffSym[][ALF_MAX_NUM_COEF], int *filters_per_fr_best, int varIndTab[], double lambda_val, int numMaxFilters)
 {
     int filterCoeffSymQuant[NO_VAR_BINS][ALF_MAX_NUM_COEF];
@@ -1816,7 +2261,7 @@ void deriveFilterInfo(alf_ctx_t *Enc_ALF, ALFParam *alfPictureParam, AlfCorrData
  * Input:
  *    alfPictureParam: The ALF parameter
  *              apsId: The ALF parameter index in the buffer
- *       isNewApsSent£ºThe New flag index
+ *       isNewApsSent: The New flag index
  *       lambda      : The lambda value in the ALF-RD decision
  * Return:
  * ---------------------------------------------------------------------------
@@ -1889,7 +2334,7 @@ int alf_get_buffer_size(const xavs2_param_t *param)
  */
 void alf_init_buffer(xavs2_t *h, uint8_t *mem_base)
 {
-    // Ï£¶û²®ÌØÉ¨ÃèË³Ðò
+    // Hilbert scan order
     static const uint8_t regionTable[NO_VAR_BINS] = {
         0, 1, 4, 5, 15, 2, 3, 6, 14, 11, 10, 7, 13, 12, 9, 8
     }
diff --git a/source/encoder/encoder.c b/source/encoder/encoder.c
index d5b3890..bf739bc 100644
--- a/source/encoder/encoder.c
+++ b/source/encoder/encoder.c
@@ -97,6 +97,7 @@ extern double tab_qsfd_thres[MAX_QP][2][CTU_DEPTH];
 static ALWAYS_INLINE
 void qsfd_calculate_threshold_of_a_frame(xavs2_t *h)
 {
+    /* Copies row h->i_qp of the file-scope table tab_qsfd_thres into h->thres_qsfd_cu; the table must not be redeclared locally, since a local array would shadow it with uninitialized stack storage. */
     assert(sizeof(h->thres_qsfd_cu) == sizeof(tab_qsfd_thres[0]));
 
     memcpy(h->thres_qsfd_cu, tab_qsfd_thres[h->i_qp], sizeof(h->thres_qsfd_cu));
@@ -187,8 +188,8 @@ void encoder_output_frame_bitstream(xavs2_handler_t *h_mgr, xavs2_frame_t *frame
  */
 void encoder_fetch_one_encoded_frame(xavs2_handler_t *h_mgr, xavs2_outpacket_t *packet, int is_flush)
 {
-    int num_encoding_frames = h_mgr->num_encode - h_mgr->num_output;  // ÕýÔÚ±àÂëÖ¡Êý
-    int num_frames_threads  = h_mgr->i_frm_threads;      // ²¢ÐÐÖ¡Êý
+    int num_encoding_frames = h_mgr->num_encode - h_mgr->num_output;  // æ­£åœ¨ç¼–ç å¸§æ•°
+    int num_frames_threads  = h_mgr->i_frm_threads;      // å¹¶è¡Œå¸§æ•°
 
     /* clear packet data */
     packet->len          = 0;
@@ -695,7 +696,7 @@ static void *encoder_aec_encode_one_frame(xavs2_t *h)
             xavs2_lcu_terminat_bit_write(p_aec, lcu_xy == slice->i_last_lcu_xy);
         }
 
-        /* ½ö¿¼ÂÇLCUÐÐ¼¶µÄSlice»®·Ö·½Ê½ */
+        /* only slice partitioning at LCU-row granularity is considered */
         if (lcu_xy >= slice->i_last_lcu_xy) {
             int bs_len;
             /* slice done */
@@ -895,7 +896,7 @@ static void encoder_decide_level_id(xavs2_param_t *param)
 {
     const int tab_level_restriction[][5] = {
         /* LevelID, MaxWidth, MaxHeight, MaxFps, MaxKBps */
-        { 0x00, 8192, 8192,   0,      0 },  // ½ûÖ¹
+        { 0x00, 8192, 8192,   0,      0 },  // ç¦æ­¢
         { 0x10,  352,  288,  15,   1500 },  // 2.0.15
         { 0x12,  352,  288,  30,   2000 },  // 2.0.30
         { 0x14,  352,  288,  60,   2500 },  // 2.0.60
@@ -919,14 +920,14 @@ static void encoder_decide_level_id(xavs2_param_t *param)
         { 0x66, 8192, 4608,  60, 480000 },  // 10.2.60
         { 0x68, 8192, 4608, 120, 240000 },  // 10.0.120
         { 0x6A, 8192, 4608, 120, 800000 },  // 10.2.120
-        { 0x00, 16384, 8192, 120, 8000000 },  // ½ûÖ¹
+        { 0x00, 16384, 8192, 120, 8000000 },  // ç¦æ­¢
     };
 
     int i = 1;
     int i_last_level = 0;
 
     for (; tab_level_restriction[i][4] != 0;) {
-        /* Î´¿ªÆôÂë¿ØÊ±£¬ÉèÖÃÎª×î´ó */
+        /* when rate control is disabled, set to the maximum */
         if (param->i_rc_method == 0 &&
             param->org_width <= tab_level_restriction[i_last_level][1] &&
             param->org_height <= tab_level_restriction[i_last_level][2] &&
@@ -934,16 +935,16 @@ static void encoder_decide_level_id(xavs2_param_t *param)
             param->org_height <= tab_level_restriction[i][2] &&
             tab_level_restriction[i_last_level][1] < tab_level_restriction[i][1] &&
             tab_level_restriction[i_last_level][2] < tab_level_restriction[i][2]) {
-            /* ÂëÂÊ¿ØÖÆÎ´¿ªÆôÊ±£¬Ñ¡ÔñÂú×ãÌõ¼þµÄ·Ö±æÂÊÏÂµÄ×î¸ßµµ */
+            /* rate control is disabled: choose the highest level among those whose resolution limit fits */
             i = i_last_level;
             break;
         }
-        /* ·Ö±æÂÊ¡¢Ö¡ÂÊ·ûºÏÒªÇó */
+        /* resolution and frame rate meet the requirements */
         if (param->org_width <= tab_level_restriction[i][1] &&
             param->org_height <= tab_level_restriction[i][2] &&
             param->frame_rate <= tab_level_restriction[i][3]) {
             i_last_level = i;
-            /* ±ÈÌØÂÊÒÑÉè¶¨£¬¿É¸ù¾Ý×î´óÂëÂÊÉèÖÃLevelID */
+            /* bitrate has been set, so LevelID can be chosen according to the maximum bitrate */
             if (param->i_rc_method != 0 &&
                 param->i_target_bitrate * 1.5 <= tab_level_restriction[i][4] * 1000 &&
                 param->bitrate_upper <= tab_level_restriction[i][4] * 1000) {
@@ -1001,8 +1002,8 @@ int encoder_check_parameters(xavs2_param_t *param)
         return -1;
     }
 
-    /* ¶àSliceÏÂ²»ÄÜ¿ªÆô cross slice loop filter£¬»áÓ°Ïì²¢ÐÐÐ§ÂÊ
-     * TODO: ºóÐø¿ÉÖ§³Ö */
+    /* cross slice loop filter cannot be enabled with multiple slices, because it hurts parallel efficiency
+     * TODO: may be supported later */
     if (param->slice_num > 1 && param->b_cross_slice_loop_filter != FALSE) {
         xavs2_log(NULL, XAVS2_LOG_WARNING, "Un-supported cross slice loop filter, forcing not filtering\n");
         param->b_cross_slice_loop_filter = FALSE;
@@ -1059,7 +1060,7 @@ int encoder_check_parameters(xavs2_param_t *param)
     }
 
     /* check bit depth */
-    if (param->profile_id != MAIN_PROFILE) {
+    if (param->profile_id != MAIN_PROFILE && param->sample_bit_depth == 8) {
         xavs2_log(NULL, XAVS2_LOG_ERROR, "Not Supported profile \"%d\", HIGH_BIT_DEPTH macro haven`t turn on!\n",
                   param->profile_id);
         return -1;
@@ -1111,8 +1112,10 @@ int encoder_check_parameters(xavs2_param_t *param)
         }
     }
 
+    int max_qp = MAX_QP + (param->sample_bit_depth - 8) * 8;
+
     /* check QP */
-    if (param->i_initial_qp > MAX_QP || param->i_initial_qp < MIN_QP) {
+    if (param->i_initial_qp > max_qp || param->i_initial_qp < MIN_QP) {
         xavs2_log(NULL, XAVS2_LOG_ERROR, "Error input parameter quant_0, check configuration file\n");
         return -1;
     }
@@ -1285,18 +1288,20 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en
     size_extra_frame_buffer = (param->enable_tdrdo + param->enable_sao + param->enable_alf) * xavs2_frame_buffer_size(param, FT_TEMP);
 
     /* compute the space size and alloc buffer */
+    if (param->input_sample_bit_depth == 8) {
     mem_size = sizeof(xavs2_t)                       +  /* xavs2_t */
                sizeof(nal_t)   * (MAX_SLICES + 6)    +  /* all nal units */
                sizeof(uint8_t) * XAVS2_BS_HEAD_LEN   +  /* bitstream buffer (frame header only) */
                sizeof(uint8_t) * bs_size             +  /* bitstream buffer for all slices */
                sizeof(slice_t) * MAX_SLICES          +  /* slice array */
-               sizeof(pel_t)   * (frame_w * 2) * num_slices + /* buffer for intra_border */
+               sizeof(pel8_t)   * (frame_w * 2) * num_slices + /* buffer for intra_border */
                sizeof(uint8_t) * w_in_scu * 32 * num_slices + /* buffer for edge filter flag (of one LCU row) */
                sizeof(int8_t)  * ipm_size      * num_slices + /* intra prediction mode buffer */
                sizeof(int8_t)  * size_4x4            +  /* inter prediction direction */
                sizeof(int8_t)  * size_4x4 * 2        +  /* reference frames */
                sizeof(mv_t)    * size_4x4 * 2        +  /* reference motion vectors */
                CACHE_LINE_SIZE * (MAX_SLICES + 32);
+
     mem_size +=
         qpel_frame_size * 3 * sizeof(mct_t)   +  /* temporary buffer for 1/4 interpolation: a,1,b */
         xavs2_me_get_buf_size(param)          +  /* buffers in me module */
@@ -1315,7 +1320,7 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en
 
     /* alloc memory space */
     mem_size = ((mem_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE) * CACHE_LINE_SIZE;
-    CHECKED_MALLOC(mem_base, uint8_t *, mem_size);
+    CHECKED_MALLOC8(mem_base, uint8_t *, mem_size);
 
     /* assign handle pointer of the xavs2 encoder */
     h = (xavs2_t *)mem_base;
@@ -1390,14 +1395,14 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en
         ALIGN_POINTER(mem_base);    /* align pointer */
 
         /* assign pointer to intra_border buffer */
-        p_slice->slice_intra_border[0] = (pel_t *)mem_base;
-        mem_base          += h->i_width * sizeof(pel_t);
+        p_slice->slice_intra_border8[0] = (pel8_t *)mem_base;
+        mem_base          += h->i_width * sizeof(pel8_t);
         ALIGN_POINTER(mem_base);
-        p_slice->slice_intra_border[1] = (pel_t *)mem_base;
-        mem_base          += (h->i_width / 2) * sizeof(pel_t);
+        p_slice->slice_intra_border8[1] = (pel8_t *)mem_base;
+        mem_base          += (h->i_width / 2) * sizeof(pel8_t);
         ALIGN_POINTER(mem_base);
-        p_slice->slice_intra_border[2] = (pel_t *)mem_base;
-        mem_base          += (h->i_width / 2) * sizeof(pel_t);
+        p_slice->slice_intra_border8[2] = (pel8_t *)mem_base;
+        mem_base          += (h->i_width / 2) * sizeof(pel8_t);
         ALIGN_POINTER(mem_base);
 
         /* buffer for edge filter flag (of one LCU row) */
@@ -1408,7 +1413,7 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en
         ALIGN_POINTER(mem_base);
     }
 
-    slice_init_bufer(h, h->slices[0]);
+    slice_init_bufer8(h, h->slices[0]);
 
     /* -------------------------------------------------------------
      *      fenc                fdec
@@ -1421,14 +1426,14 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en
      */
 
     /* assign pointers for p_fenc (Y/U/V pointers) */
-    h->lcu.p_fenc[0] = h->lcu.fenc_buf;
-    h->lcu.p_fenc[1] = h->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE;
-    h->lcu.p_fenc[2] = h->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE + (FENC_STRIDE / 2);
+    h->lcu.p_fenc8[0] = h->lcu.fenc_buf8;
+    h->lcu.p_fenc8[1] = h->lcu.fenc_buf8 + FENC_STRIDE * MAX_CU_SIZE;
+    h->lcu.p_fenc8[2] = h->lcu.fenc_buf8 + FENC_STRIDE * MAX_CU_SIZE + (FENC_STRIDE / 2);
 
     /* assign pointers for p_fdec (Y/U/V pointers) */
-    h->lcu.p_fdec[0] = h->lcu.fdec_buf;
-    h->lcu.p_fdec[1] = h->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE;
-    h->lcu.p_fdec[2] = h->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE + (FDEC_STRIDE / 2);
+    h->lcu.p_fdec8[0] = h->lcu.fdec_buf8;
+    h->lcu.p_fdec8[1] = h->lcu.fdec_buf8 + FDEC_STRIDE * MAX_CU_SIZE;
+    h->lcu.p_fdec8[2] = h->lcu.fdec_buf8 + FDEC_STRIDE * MAX_CU_SIZE + (FDEC_STRIDE / 2);
 
     /* slice index of CTUs */
     h->lcu_slice_idx = (int8_t *)mem_base;
@@ -1512,11 +1517,11 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en
         mem_base  += sizeof(lcu_info_t) * w_in_lcu;
 
         if (xavs2_thread_mutex_init(&row->mutex, NULL)) {
-            goto fail;
+            goto fail8;
         }
 
         if (xavs2_thread_cond_init(&row->cond, NULL)) {
-            goto fail;
+            goto fail8;
         }
     }
 
@@ -1573,7 +1578,7 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en
 
     if ((uintptr_t)(h) + mem_size < (uintptr_t)(mem_base)) {
         /* malloc size allocation error: no enough memory */
-        goto fail;
+        goto fail8;
     }
     /* -------------------------------------------------------------
      * init other properties/modules for xavs2 encoder
@@ -1591,8 +1596,319 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en
 
     return h;
 
-fail:
+fail8:
+    return NULL;
+    } else {
+    mem_size = sizeof(xavs2_t)                       +  /* xavs2_t */
+               sizeof(nal_t)   * (MAX_SLICES + 6)    +  /* all nal units */
+               sizeof(uint8_t) * XAVS2_BS_HEAD_LEN   +  /* bitstream buffer (frame header only) */
+               sizeof(uint8_t) * bs_size             +  /* bitstream buffer for all slices */
+               sizeof(slice_t) * MAX_SLICES          +  /* slice array */
+               sizeof(pel10_t)   * (frame_w * 2) * num_slices + /* buffer for intra_border */
+               sizeof(uint8_t) * w_in_scu * 32 * num_slices + /* buffer for edge filter flag (of one LCU row) */
+               sizeof(int8_t)  * ipm_size      * num_slices + /* intra prediction mode buffer */
+               sizeof(int8_t)  * size_4x4            +  /* inter prediction direction */
+               sizeof(int8_t)  * size_4x4 * 2        +  /* reference frames */
+               sizeof(mv_t)    * size_4x4 * 2        +  /* reference motion vectors */
+               CACHE_LINE_SIZE * (MAX_SLICES + 32);
+
+    mem_size +=
+        qpel_frame_size * 3 * sizeof(mct_t)   +  /* temporary buffer for 1/4 interpolation: a,1,b */
+        xavs2_me_get_buf_size(param)          +  /* buffers in me module */
+        info_size                             +  /* the frame info structure */
+        frame_size_in_scu * sizeof(cu_info_t) +  /* CU data */
+        num_me_bytes                          +  /* Motion Estimation */
+        w_in_lcu * h_in_lcu * sizeof(int8_t)  +  /* CTU slice index */
+        size_extra_frame_buffer               +  /* extra frame buffer: TDRDO, SAO, ALF */
+
+        size_sao_stats + CACHE_LINE_SIZE      +  /* SAO stat data */
+        size_sao_param + CACHE_LINE_SIZE      +  /* SAO parameters */
+        size_sao_onoff + CACHE_LINE_SIZE      +  /* SAO on/off number of LCU row */
+
+        size_alf + CACHE_LINE_SIZE            +  /* ALF encoder contexts */
+        CACHE_LINE_SIZE * 30;                    /* used for align buffer */
+
+    /* alloc memory space */
+    mem_size = ((mem_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE) * CACHE_LINE_SIZE;
+    CHECKED_MALLOC10(mem_base, uint8_t *, mem_size);
+
+    /* assign handle pointer of the xavs2 encoder */
+    h = (xavs2_t *)mem_base;
+    memset(h, 0, sizeof(xavs2_t));
+    mem_base += sizeof(xavs2_t);
+    ALIGN_POINTER(mem_base);          /* align pointer */
+
+    /* init log module */
+    h->module_log.i_log_level = param->i_log_level;
+    sprintf(h->module_log.module_name, "Enc[%2d] %06llx", idx_frm_encoder, (uintptr_t)(h));
+
+    /* copy the input parameters */
+    h->param = param;
+
+    /* const properties */
+    h->i_width           = frame_w;
+    h->i_height          = frame_h;
+    h->i_width_in_lcu    = w_in_lcu;
+    h->i_height_in_lcu   = h_in_lcu;
+    h->i_width_in_mincu  = w_in_scu;
+    h->i_height_in_mincu = h_in_scu;
+    h->i_width_in_minpu  = w_in_4x4;
+    h->i_height_in_minpu = h_in_4x4;
+
+    h->framerate         = h->param->frame_rate;
+
+    h->i_lcu_level       = h->param->lcu_bit_level;
+    h->i_scu_level       = h->param->scu_bit_level;
+    h->i_chroma_v_shift  = h->param->chroma_format == CHROMA_420;
+    h->i_max_ref         = h->param->num_max_ref;
+    h->b_progressive     = (bool_t)h->param->progressive_frame;
+    h->b_field_sequence  = (h->param->InterlaceCodingOption == FIELD_CODING);
+
+    /* set table which indicates numbers of intra prediction modes for RDO */
+    for (i = 0; i < MAX_CU_SIZE_IN_BIT; i++) {
+        h->tab_num_intra_rdo[i] = 1;                 /* this will later be set according to the preset level */
+    }
+    h->num_rdo_intra_chroma = NUM_INTRA_MODE_CHROMA;
+
+    /* -------------------------------------------------------------
+     * assign buffer pointers of xavs2 encoder
+     */
+
+    /* point to all nal units */
+    h->p_nal  = (nal_t *)mem_base;
+    mem_base += sizeof(nal_t) * (MAX_SLICES + 6);
+    ALIGN_POINTER(mem_base);    /* align pointer */
+
+    /* bitstream buffer (frame header) */
+    h->p_bs_buf_header = mem_base;
+    h->i_bs_buf_header = sizeof(uint8_t) * XAVS2_BS_HEAD_LEN;
+    mem_base          += sizeof(uint8_t) * XAVS2_BS_HEAD_LEN;
+    ALIGN_POINTER(mem_base);    /* align pointer */
+
+    /* bitstream buffer for all slices */
+    h->p_bs_buf_slice = mem_base;
+    h->i_bs_buf_slice = sizeof(uint8_t) * bs_size;
+    mem_base         += sizeof(uint8_t) * bs_size;
+    ALIGN_POINTER(mem_base);    /* align pointer */
+
+    /* slice array */
+    for (i = 0; i < num_slices; i++) {
+        slice_t *p_slice = (slice_t *)mem_base;
+        h->slices[i] = p_slice;
+        mem_base    += sizeof(slice_t);
+        ALIGN_POINTER(mem_base);    /* align pointer */
+
+        /* intra prediction mode buffer */
+        p_slice->slice_ipredmode  = (int8_t *)mem_base;
+        mem_base                 += sizeof(int8_t) * ipm_size;
+        p_slice->slice_ipredmode += (h->i_width_in_minpu + 16) + 16;
+        ALIGN_POINTER(mem_base);    /* align pointer */
+
+        /* assign pointer to intra_border buffer */
+        p_slice->slice_intra_border10[0] = (pel10_t *)mem_base;
+        mem_base          += h->i_width * sizeof(pel10_t);
+        ALIGN_POINTER(mem_base);
+        p_slice->slice_intra_border10[1] = (pel10_t *)mem_base;
+        mem_base          += (h->i_width / 2) * sizeof(pel10_t);
+        ALIGN_POINTER(mem_base);
+        p_slice->slice_intra_border10[2] = (pel10_t *)mem_base;
+        mem_base          += (h->i_width / 2) * sizeof(pel10_t);
+        ALIGN_POINTER(mem_base);
+
+        /* buffer for edge filter flag (of one LCU row) */
+        p_slice->slice_deblock_flag[0] = (uint8_t *)mem_base;
+        mem_base            += h->i_width_in_mincu * (MAX_CU_SIZE / MIN_PU_SIZE) * sizeof(uint8_t);
+        p_slice->slice_deblock_flag[1] = (uint8_t *)mem_base;
+        mem_base            += h->i_width_in_mincu * (MAX_CU_SIZE / MIN_PU_SIZE) * sizeof(uint8_t);
+        ALIGN_POINTER(mem_base);
+    }
+
+    slice_init_bufer10(h, h->slices[0]);
+
+    /* -------------------------------------------------------------
+     *      fenc                fdec
+     *      Y Y Y Y             Y Y Y Y
+     *      Y Y Y Y             Y Y Y Y
+     *      Y Y Y Y             Y Y Y Y
+     *      Y Y Y Y             Y Y Y Y
+     *      U U V V             U U V V
+     *      U U V V             U U V V
+     */
+
+    /* assign pointers for p_fenc (Y/U/V pointers) */
+    h->lcu.p_fenc10[0] = h->lcu.fenc_buf10;
+    h->lcu.p_fenc10[1] = h->lcu.fenc_buf10 + FENC_STRIDE * MAX_CU_SIZE;
+    h->lcu.p_fenc10[2] = h->lcu.fenc_buf10 + FENC_STRIDE * MAX_CU_SIZE + (FENC_STRIDE / 2);
+
+    /* assign pointers for p_fdec (Y/U/V pointers) */
+    h->lcu.p_fdec10[0] = h->lcu.fdec_buf10;
+    h->lcu.p_fdec10[1] = h->lcu.fdec_buf10 + FDEC_STRIDE * MAX_CU_SIZE;
+    h->lcu.p_fdec10[2] = h->lcu.fdec_buf10 + FDEC_STRIDE * MAX_CU_SIZE + (FDEC_STRIDE / 2);
+
+    /* slice index of CTUs */
+    h->lcu_slice_idx = (int8_t *)mem_base;
+    mem_base += w_in_lcu * h_in_lcu * sizeof(int8_t);
+    ALIGN_POINTER(mem_base);    /* align pointer */
+
+    /* inter prediction mode */
+    h->dir_pred = (int8_t *)mem_base;
+    mem_base += sizeof(int8_t) * size_4x4;
+    ALIGN_POINTER(mem_base);    /* align pointer */
+
+    /* reference frames */
+    h->fwd_1st_ref = (int8_t *)mem_base;
+    mem_base      += sizeof(int8_t) * size_4x4;
+    ALIGN_POINTER(mem_base);    /* align pointer */
+    h->bwd_2nd_ref = (int8_t *)mem_base;
+    mem_base      += sizeof(int8_t) * size_4x4;
+    ALIGN_POINTER(mem_base);    /* align pointer */
+
+    /* reference motion vectors */
+    h->fwd_1st_mv = (mv_t *)mem_base;
+    mem_base     += sizeof(mv_t) * size_4x4;
+    ALIGN_POINTER(mem_base);    /* align pointer */
+    h->bwd_2nd_mv = (mv_t *)mem_base;
+    mem_base     += sizeof(mv_t) * size_4x4;
+    ALIGN_POINTER(mem_base);    /* align pointer */
+
+    /* temporary buffer for 1/4 interpolation: a,1,b, alone buffer */
+    h->img4Y_tmp[0] = (mct_t *)mem_base;
+    h->img4Y_tmp[1] = h->img4Y_tmp[0] + qpel_frame_size;
+    h->img4Y_tmp[2] = h->img4Y_tmp[0] + qpel_frame_size * 2;
+    mem_base       += qpel_frame_size * 3 * sizeof(mct_t);
+    ALIGN_POINTER(mem_base);
+
+    /* SAO data */
+    h->sao_stat_datas = (SAOStatData (*)[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES])mem_base;
+    memset(h->sao_stat_datas[0], 0, size_sao_stats);
+    mem_base += size_sao_stats;
+    ALIGN_POINTER(mem_base);
+
+    h->sao_blk_params = (SAOBlkParam (*)[NUM_SAO_COMPONENTS])mem_base;
+    memset(h->sao_blk_params[0], 0, size_sao_param);
+    mem_base += size_sao_param;
+    ALIGN_POINTER(mem_base);
+
+    h->num_sao_lcu_off = (int (*)[NUM_SAO_COMPONENTS])mem_base;
+    memset(h->num_sao_lcu_off[0], 0, size_sao_onoff);
+    mem_base += size_sao_onoff;
+    ALIGN_POINTER(mem_base);
+
+
+    /* init memory space in me module */
+    xavs2_me_init(h, &mem_base);
+
+    /* allocate frame_info_t (one for each frame context) */
+    h->frameinfo = (frame_info_t *)mem_base;
+    mem_base    += sizeof(frame_info_t);
+    ALIGN_POINTER(mem_base);    /* align pointer */
+
+    h->frameinfo->rows = (row_info_t *)mem_base;
+    mem_base          += sizeof(row_info_t) * h_in_lcu;
+    ALIGN_POINTER(mem_base);    /* align pointer */
+
+    /* set available tables */
+    set_available_tables(h);
+
+    /* assign pointers for all coding tree units */
+    h->lcu.p_ctu    = &h->lcu.all_cu[0];
+    h->lcu.i_scu_xy = 1;        // borrowed
+    build_coding_tree(h, h->lcu.p_ctu, 0, h->i_lcu_level, 0, 0);
+    h->lcu.i_scu_xy = 0;        // reset
+
+    /* set row info */
+    for (i = 0; i < h_in_lcu; i++) {
+        row_info_t *row = &h->frameinfo->rows[i];
+
+        row->h     = 0;
+        row->row   = i;
+        row->coded = -1;
+        row->lcus  = (lcu_info_t *)mem_base;
+        mem_base  += sizeof(lcu_info_t) * w_in_lcu;
+
+        if (xavs2_thread_mutex_init(&row->mutex, NULL)) {
+            goto fail10;
+        }
+
+        if (xavs2_thread_cond_init(&row->cond, NULL)) {
+            goto fail10;
+        }
+    }
+
+    /* check memory size */
+    ALIGN_POINTER(mem_base);    /* align pointer */
+
+    /* -------------------------------------------------------------
+     * allocate other alone spaces for xavs2 encoder
+     */
+
+    h->cu_info = (cu_info_t *)mem_base;
+    mem_base  += frame_size_in_scu * sizeof(cu_info_t);
+    ALIGN_POINTER(mem_base);
+
+    p_cu_info = h->cu_info;
+    for (j = 0; j < h_in_scu; j++) {
+        for (i = 0; i < w_in_scu; i++) {
+            scu_xy++;
+            p_cu_info->i_scu_x = i;
+            p_cu_info->i_scu_y = j;
+            p_cu_info++;
+        }
+    }
+
+    /* motion estimation buffer */
+    h->all_mincost = (dist_t(*)[MAX_INTER_MODES][MAX_REFS])mem_base;
+    mem_base += num_me_bytes;
+    ALIGN_POINTER(mem_base);
+
+    // allocate memory for current frame
+    if (h->param->enable_tdrdo) {
+        h->img_luma_pre = xavs2_frame_new(h, &mem_base, FT_TEMP);
+        ALIGN_POINTER(mem_base);
+    } else {
+        h->img_luma_pre = NULL;
+    }
+
+    if (h->param->enable_sao) {
+        h->img_sao = xavs2_frame_new(h, &mem_base, FT_TEMP);
+        ALIGN_POINTER(mem_base);
+    } else {
+        h->img_sao = NULL;
+    }
+
+    if (h->param->enable_alf) {
+        h->img_alf = xavs2_frame_new(h, &mem_base, FT_TEMP);
+        ALIGN_POINTER(mem_base);
+        alf_init_buffer(h, mem_base);
+        mem_base += size_alf;
+        ALIGN_POINTER(mem_base);
+    } else {
+        h->img_alf = NULL;
+    }
+
+    if ((uintptr_t)(h) + mem_size < (uintptr_t)(mem_base)) {
+        /* malloc size allocation error: no enough memory */
+        goto fail10;
+    }
+    /* -------------------------------------------------------------
+     * init other properties/modules for xavs2 encoder
+     */
+
+    /* init all slices */
+    xavs2_slices_init(h);
+
+#if ENABLE_WQUANT
+    /* adaptive frequency weighting quantization */
+    if (h->param->enable_wquant) {
+        xavs2_wq_init_seq_quant_param(h);
+    }
+#endif
+
+    return h;
+
+fail10:
     return NULL;
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -1632,8 +1948,9 @@ int encoder_contexts_init(xavs2_t *h, xavs2_handler_t *h_mgr)
 
     /* -------------------------------------------------------------
      * build lcu row encoding contexts */
+    if (h->param->input_sample_bit_depth == 8) {
     if (h_mgr->num_row_contexts > 1) {
-        CHECKED_MALLOC(h_mgr->row_contexts, xavs2_t *, h_mgr->num_row_contexts * sizeof(xavs2_t));
+        CHECKED_MALLOC8(h_mgr->row_contexts, xavs2_t *, h_mgr->num_row_contexts * sizeof(xavs2_t));
 
         for (i = 0; i < h_mgr->num_row_contexts; i++) {
             xavs2_t *h_row_coder = &h_mgr->row_contexts[i];
@@ -1654,14 +1971,14 @@ int encoder_contexts_init(xavs2_t *h, xavs2_handler_t *h_mgr)
             h_row_coder->lcu.i_scu_xy  = 0;     // reset
 
             /* assign pointers for p_fenc (Y/U/V pointers) */
-            h_row_coder->lcu.p_fenc[0] = h_row_coder->lcu.fenc_buf;
-            h_row_coder->lcu.p_fenc[1] = h_row_coder->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE;
-            h_row_coder->lcu.p_fenc[2] = h_row_coder->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE + FENC_STRIDE / 2;
+            h_row_coder->lcu.p_fenc8[0] = h_row_coder->lcu.fenc_buf8;
+            h_row_coder->lcu.p_fenc8[1] = h_row_coder->lcu.fenc_buf8 + FENC_STRIDE * MAX_CU_SIZE;
+            h_row_coder->lcu.p_fenc8[2] = h_row_coder->lcu.fenc_buf8 + FENC_STRIDE * MAX_CU_SIZE + FENC_STRIDE / 2;
 
             /* assign pointers for p_fdec (Y/U/V pointers) */
-            h_row_coder->lcu.p_fdec[0] = h_row_coder->lcu.fdec_buf;
-            h_row_coder->lcu.p_fdec[1] = h_row_coder->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE;
-            h_row_coder->lcu.p_fdec[2] = h_row_coder->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE + FDEC_STRIDE / 2;
+            h_row_coder->lcu.p_fdec8[0] = h_row_coder->lcu.fdec_buf8;
+            h_row_coder->lcu.p_fdec8[1] = h_row_coder->lcu.fdec_buf8 + FDEC_STRIDE * MAX_CU_SIZE;
+            h_row_coder->lcu.p_fdec8[2] = h_row_coder->lcu.fdec_buf8 + FDEC_STRIDE * MAX_CU_SIZE + FDEC_STRIDE / 2;
         }
     }
 
@@ -1670,7 +1987,7 @@ int encoder_contexts_init(xavs2_t *h, xavs2_handler_t *h_mgr)
     h_mgr->frm_contexts[0] = h; /* context 0 is the main encoder handle */
     for (i = 1; i < h_mgr->i_frm_threads; i++) {
         if ((h_mgr->frm_contexts[i] = encoder_create_frame_context(h->param, i)) == 0) {
-            goto fail;
+            goto fail8;
         }
 
         memcpy(&h_mgr->frm_contexts[i]->communal_vars_1, &h->communal_vars_1,
@@ -1679,8 +1996,59 @@ int encoder_contexts_init(xavs2_t *h, xavs2_handler_t *h_mgr)
 
     return 0;
 
-fail:
+fail8:
+    return -1;
+    } else {
+    if (h_mgr->num_row_contexts > 1) {
+        CHECKED_MALLOC10(h_mgr->row_contexts, xavs2_t *, h_mgr->num_row_contexts * sizeof(xavs2_t));
+
+        for (i = 0; i < h_mgr->num_row_contexts; i++) {
+            xavs2_t *h_row_coder = &h_mgr->row_contexts[i];
+
+            memcpy(&h_row_coder->communal_vars_1, &h->communal_vars_1,
+                   (uint8_t *)&h->communal_vars_2 - (uint8_t *)&h->communal_vars_1);
+
+            /* identify ourself */
+            h_row_coder->task_type = XAVS2_TASK_ROW;
+
+            /* we are free */
+            h_row_coder->i_aec_frm = -1;
+
+            /* assign pointers for all coding tree units */
+            h_row_coder->lcu.p_ctu     = &h_row_coder->lcu.all_cu[0];
+            h_row_coder->lcu.i_scu_xy  = 1;     // borrowed
+            build_coding_tree(h_row_coder, h_row_coder->lcu.p_ctu, 0, h_row_coder->i_lcu_level, 0, 0);
+            h_row_coder->lcu.i_scu_xy  = 0;     // reset
+
+            /* assign pointers for p_fenc (Y/U/V pointers) */
+            h_row_coder->lcu.p_fenc10[0] = h_row_coder->lcu.fenc_buf10;
+            h_row_coder->lcu.p_fenc10[1] = h_row_coder->lcu.fenc_buf10 + FENC_STRIDE * MAX_CU_SIZE;
+            h_row_coder->lcu.p_fenc10[2] = h_row_coder->lcu.fenc_buf10 + FENC_STRIDE * MAX_CU_SIZE + FENC_STRIDE / 2;
+
+            /* assign pointers for p_fdec (Y/U/V pointers) */
+            h_row_coder->lcu.p_fdec10[0] = h_row_coder->lcu.fdec_buf10;
+            h_row_coder->lcu.p_fdec10[1] = h_row_coder->lcu.fdec_buf10 + FDEC_STRIDE * MAX_CU_SIZE;
+            h_row_coder->lcu.p_fdec10[2] = h_row_coder->lcu.fdec_buf10 + FDEC_STRIDE * MAX_CU_SIZE + FDEC_STRIDE / 2;
+        }
+    }
+
+    /* -------------------------------------------------------------
+     * build frame encoding contexts */
+    h_mgr->frm_contexts[0] = h; /* context 0 is the main encoder handle */
+    for (i = 1; i < h_mgr->i_frm_threads; i++) {
+        if ((h_mgr->frm_contexts[i] = encoder_create_frame_context(h->param, i)) == 0) {
+            goto fail10;
+        }
+
+        memcpy(&h_mgr->frm_contexts[i]->communal_vars_1, &h->communal_vars_1,
+               (uint8_t *)&h->communal_vars_2 - (uint8_t *)&h->communal_vars_1);
+    }
+
+    return 0;
+
+fail10:
     return -1;
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -1837,12 +2205,22 @@ static void init_decoding_frame(xavs2_t *h)
 static void encoder_init_func_handles(xavs2_t *h)
 {
     /* set some function handles according option or preset level */
+    if (h->param->input_sample_bit_depth == 8) {
     if (h->param->enable_hadamard) {
-        g_funcs.pixf.intra_cmp = g_funcs.pixf.satd;
-        g_funcs.pixf.fpel_cmp  = g_funcs.pixf.satd;
+        g_funcs.pixf.intra8_cmp = g_funcs.pixf.satd8;
+        g_funcs.pixf.fpel8_cmp  = g_funcs.pixf.satd8;
     } else {
-        g_funcs.pixf.intra_cmp = g_funcs.pixf.sad;
-        g_funcs.pixf.fpel_cmp  = g_funcs.pixf.sad;
+        g_funcs.pixf.intra8_cmp = g_funcs.pixf.sad8;
+        g_funcs.pixf.fpel8_cmp  = g_funcs.pixf.sad8;
+    }
+    } else {
+    if (h->param->enable_hadamard) {
+        g_funcs.pixf.intra10_cmp = g_funcs.pixf.satd10;
+        g_funcs.pixf.fpel10_cmp  = g_funcs.pixf.satd10;
+    } else {
+        g_funcs.pixf.intra10_cmp = g_funcs.pixf.sad10;
+        g_funcs.pixf.fpel10_cmp  = g_funcs.pixf.sad10;
+    }
     }
 }
 
@@ -1992,7 +2370,7 @@ void xavs2e_frame_coding_init(xavs2_t *h)
     /* encoding begin ----------------------------------------------
      */
 
-    /* Ö¡¼¶ÆäËû²ÎÊý³õÊ¼»¯ */
+    /* initialize the other frame-level parameters */
     if (IS_ALG_ENABLE(OPT_CU_QSFD)) {
         qsfd_calculate_threshold_of_a_frame(h);
     }
@@ -2038,7 +2416,7 @@ void *xavs2e_encode_one_frame(void *arg)
 
     /* start AEC frame coding */
     if (h->h_top->threadpool_aec != NULL && !h->param->enable_alf) {
-        xavs2_threadpool_run(h->h_top->threadpool_aec, encoder_aec_encode_one_frame, h, 0);
+        xavs2_threadpool_run(h->h_top->threadpool_aec, (void * (*)(void *)) encoder_aec_encode_one_frame, h, 0);
     }
 
     /* (3) encode all LCU rows in current frame ---------------------------
@@ -2051,44 +2429,44 @@ void *xavs2e_encode_one_frame(void *arg)
 
         h->i_slice_index = g_slice_lcu_row_order[i].slice_idx;
 
-        /* ÊÇ·ñÐèÒª¶îÍâ´¦ÀíSlice±ß½ç */
+        /* whether the slice boundary needs extra handling */
         row->b_top_slice_border  = 0;
         row->b_down_slice_border = 0;
 
-        /* µ±Ç°Ö¡ÄÚµÄÒÀÀµÐÐ */
+        /* the row this one depends on within the current frame */
         if (row_type) {
             last_row = &rows[lcu_y - 1];
             row->b_down_slice_border = (row_type == 2 && lcu_y != h->i_height_in_lcu - 1);
         } else {
-            xavs2_slice_write_start(h);  /* SliceµÄµÚÒ»ÐÐ£¬³õÊ¼»¯ */
+            xavs2_slice_write_start(h);  /* first row of the slice: initialize */
             last_row = NULL;
             row->b_top_slice_border = (lcu_y > 0);
         }
 
-        /* µÈ´ý²Î¿¼Ö¡ÖÐÒÀÀµµÄÐÐ±àÂëÍê±Ï */
+        /* wait until the dependent rows in the reference frames have been encoded */
         xavs2e_inter_sync(h, lcu_y, 0);
 
         /* encode one lcu row */
         if (enable_wpp && i != h->i_height_in_lcu - 1) {
-            /* 1, ·ÖÅäÒ»¸öÐÐ¼¶µÄÏß³Ì½øÐÐ±àÂë */
+            /* 1, allocate a row-level thread context for encoding */
             if ((row->h = xavs2e_alloc_row_task(h)) == NULL) {
                 return NULL;
             }
 
-            /* 2, ¼ì²éµ±Ç°ÐÐÊÇ·ñÓ¦Á¢¿ÌÆô¶¯£»
-             *    ¹æÔòÎªµÈ´ýÉÏÒ»ÐÐÖÁÉÙÍê³ÉÁ½¸öLCU²ÅÆô¶¯Ïß³Ì£¬ÕâÀïÖÁÉÙµÈ´ý1¸ö
+            /* 2, check whether the current row should start immediately;
+             *    the rule is to start the thread only after the previous row has finished at least two LCUs; here we wait for at least 1
              */
             wait_lcu_row_coded(last_row, 0);
 
-            /* 3, Ê¹ÓÃ¸ÃÐÐ¼¶Ïß³Ì½øÐÐ±àÂë */
+            /* 3, encode the row using this row-level thread */
             xavs2_threadpool_run(h->h_top->threadpool_rdo, xavs2_lcu_row_write, row, 0);
         } else {
             row->h = h;
             xavs2_lcu_row_write(row);
         }
 
-        /* ¶ÔSliceµÄ×îºóÒ»ÐÐLCUÀ´Ëµ£¬ÐèÒªºÏ²¢¶à¸öSliceµÄÂëÁ÷
-         * µ«ÔÚRDO½×¶Î£¬²¢²»ÐèÒª */
+        /* for the last LCU row of a slice, the bitstreams of multiple slices need to be merged,
+         * but this is not needed in the RDO stage */
         // if (h->param->slice_num > 1 && row_type == 2) {
         //     nal_merge_slice(h, h->slices[h->i_slice_index]->p_bs_buf, h->i_nal_type, h->i_nal_ref_idc);
         // }
@@ -2107,7 +2485,7 @@ void *xavs2e_encode_one_frame(void *arg)
         }
     }
 
-    /* (5) Í³¼ÆSAOµÄ¿ªÆôºÍ¿ª¹Ø±ÈÂÊ */
+    /* (5) gather statistics on the SAO on/off ratio */
     if (h->param->enable_sao && (h->slice_sao_on[0] || h->slice_sao_on[1] || h->slice_sao_on[2])) {
         int sao_off_num_y = 0;
         int sao_off_num_u = 0;
@@ -2132,7 +2510,7 @@ void *xavs2e_encode_one_frame(void *arg)
         xavs2_frame_copy_planes(h, h->img_alf, h->fdec);
         xavs2_frame_expand_border_frame(h, h->img_alf);
         alf_filter_one_frame(h);
-        /* ÖØÐÂ¶ÔÖØ¹¹Í¼Ïñ±ß½ç½øÐÐÀ©Õ¹ */
+        /* expand the border of the reconstructed picture again */
         if (h->pic_alf_on[0] || h->pic_alf_on[1] || h->pic_alf_on[2]) {
             xavs2_frame_expand_border_frame(h, h->fdec);
         }
@@ -2147,7 +2525,7 @@ void *xavs2e_encode_one_frame(void *arg)
 #endif
 
         if (h->h_top->threadpool_aec != NULL) {
-            xavs2_threadpool_run(h->h_top->threadpool_aec, encoder_aec_encode_one_frame, h, 0);
+            xavs2_threadpool_run(h->h_top->threadpool_aec, (void * (*)(void *)) encoder_aec_encode_one_frame, h, 0);
         }
     }
 
diff --git a/source/encoder/encoder_report.c b/source/encoder/encoder_report.c
index 9873c15..6849e66 100644
--- a/source/encoder/encoder_report.c
+++ b/source/encoder/encoder_report.c
@@ -72,21 +72,22 @@ void encoder_cal_psnr(xavs2_t *h, double *psnr_y, double *psnr_u, double *psnr_v
     const int inout_shift     = 0;
     uint64_t diff_y, diff_u, diff_v;
 
+    if (h->param->input_sample_bit_depth == 8) {
     /* luma */
-    diff_y = xavs2_pixel_ssd_wxh(&g_funcs.pixf,
-                                 h->fenc->planes[0], h->fenc->i_stride[0],
-                                 h->fdec->planes[0], h->fdec->i_stride[0], i_width, i_height, inout_shift);
+    diff_y = xavs2_pixel_ssd8_wxh(&g_funcs.pixf,
+                                 h->fenc->planes8[0], h->fenc->i_stride[0],
+                                 h->fdec->planes8[0], h->fdec->i_stride[0], i_width, i_height, inout_shift);
 
     /* chroma */
     if (h->param->chroma_format != CHROMA_400) {
         i_width  >>= 1;
         i_height >>= 1;
-        diff_u = xavs2_pixel_ssd_wxh(&g_funcs.pixf,
-                                     h->fenc->planes[1], h->fenc->i_stride[1],
-                                     h->fdec->planes[1], h->fdec->i_stride[1], i_width, i_height, inout_shift);
-        diff_v = xavs2_pixel_ssd_wxh(&g_funcs.pixf,
-                                     h->fenc->planes[2], h->fenc->i_stride[2],
-                                     h->fdec->planes[2], h->fdec->i_stride[2], i_width, i_height, inout_shift);
+        diff_u = xavs2_pixel_ssd8_wxh(&g_funcs.pixf,
+                                     h->fenc->planes8[1], h->fenc->i_stride[1],
+                                     h->fdec->planes8[1], h->fdec->i_stride[1], i_width, i_height, inout_shift);
+        diff_v = xavs2_pixel_ssd8_wxh(&g_funcs.pixf,
+                                     h->fenc->planes8[2], h->fenc->i_stride[2],
+                                     h->fdec->planes8[2], h->fdec->i_stride[2], i_width, i_height, inout_shift);
     } else {
         diff_u = 0;
         diff_v = 0;
@@ -98,6 +99,34 @@ void encoder_cal_psnr(xavs2_t *h, double *psnr_y, double *psnr_u, double *psnr_v
     *psnr_y = get_psnr_with_ssd(f_max_signal, diff_y);
     *psnr_u = get_psnr_with_ssd(f_max_signal, diff_u * uvformat);
     *psnr_v = get_psnr_with_ssd(f_max_signal, diff_v * uvformat);
+    } else {
+    /* luma */
+    diff_y = xavs2_pixel_ssd10_wxh(&g_funcs.pixf,
+                                 h->fenc->planes10[0], h->fenc->i_stride[0],
+                                 h->fdec->planes10[0], h->fdec->i_stride[0], i_width, i_height, inout_shift);
+
+    /* chroma */
+    if (h->param->chroma_format != CHROMA_400) {
+        i_width  >>= 1;
+        i_height >>= 1;
+        diff_u = xavs2_pixel_ssd10_wxh(&g_funcs.pixf,
+                                     h->fenc->planes10[1], h->fenc->i_stride[1],
+                                     h->fdec->planes10[1], h->fdec->i_stride[1], i_width, i_height, inout_shift);
+        diff_v = xavs2_pixel_ssd10_wxh(&g_funcs.pixf,
+                                     h->fenc->planes10[2], h->fenc->i_stride[2],
+                                     h->fdec->planes10[2], h->fdec->i_stride[2], i_width, i_height, inout_shift);
+    } else {
+        diff_u = 0;
+        diff_v = 0;
+    }
+
+    xavs2_emms();     /* call before using float instructions */
+
+    /* get the PSNR for current frame */
+    *psnr_y = get_psnr_with_ssd(f_max_signal, diff_y);
+    *psnr_u = get_psnr_with_ssd(f_max_signal, diff_u * uvformat);
+    *psnr_v = get_psnr_with_ssd(f_max_signal, diff_v * uvformat);
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -136,12 +165,13 @@ double ssim_calculate_plane(xavs2_t *h, int comp_id)
     double C1 = k_ssim_1 * k_ssim_1 * uiMaxval * uiMaxval;
     double C2 = k_ssim_2 * k_ssim_2 * uiMaxval * uiMaxval;
 
-    pel_t*  pOrg = h->fenc->planes[comp_id];
-    pel_t*  pRec = h->fdec->planes[comp_id];
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t*  pOrg = h->fenc->planes8[comp_id];
+    pel8_t*  pRec = h->fdec->planes8[comp_id];
     // xavs2_log(h, XAVS2_LOG_INFO, "pOrg : %p pRec : %p\n",pOrg,pRec);
 
-    pel_t*  pOrgPel = pOrg;
-    pel_t*  pRecPel = pRec;
+    pel8_t*  pOrgPel = pOrg;
+    pel8_t*  pRecPel = pRec;
 
     for (j = 0; j <= uiHeight - uiWinHeight; j++) {
         for (i = 0; i <= uiWidth - uiWinWidth; i++) {
@@ -191,6 +221,63 @@ double ssim_calculate_plane(xavs2_t *h, int comp_id)
 
     // xavs2_log(h, XAVS2_LOG_INFO,"ssim: %7.4f \n ", dMSSIM / (double)uiNumWin);
     return dMSSIM / (double)uiNumWin;
+    } else {
+    pel10_t*  pOrg = h->fenc->planes10[comp_id];
+    pel10_t*  pRec = h->fdec->planes10[comp_id];
+    // xavs2_log(h, XAVS2_LOG_INFO, "pOrg : %p pRec : %p\n",pOrg,pRec);
+
+    pel10_t*  pOrgPel = pOrg;
+    pel10_t*  pRecPel = pRec;
+
+    for (j = 0; j <= uiHeight - uiWinHeight; j++) {
+        for (i = 0; i <= uiWidth - uiWinWidth; i++) {
+            dLocMeanRef = 0;
+            dLocMeanRec = 0;
+            dLocVarRef = 0;
+            dLocVarRec = 0;
+            dLocCovar = 0;
+            pOrgPel = pOrg + i + iStride1*j;
+            pRecPel = pRec + i + iStride2*j;
+            // xavs2_log(h, XAVS2_LOG_INFO, "pOrgPel[0] : %d pRecPel[0] : %d\n",pOrgPel[0],pRecPel[0]);
+            // xavs2_log(h, XAVS2_LOG_INFO, "uiWinWidth : %d uiWinHeight : %d\n",uiWinWidth,uiWinHeight);
+
+            for (y = 0; y < uiWinHeight; y++) {
+                for (x = 0; x < uiWinWidth; x++) {
+                    // xavs2_log(h, XAVS2_LOG_INFO, "pOrgPel[%d] : %d pRecPel[%d] : %d\n",x,pOrgPel[x],x,pRecPel[x]);
+
+                    dLocMeanRef += pOrgPel[x];
+                    dLocMeanRec += pRecPel[x];
+                    dLocVarRef += pOrgPel[x] * pOrgPel[x];
+                    dLocVarRec += pRecPel[x] * pRecPel[x];
+                    dLocCovar += pOrgPel[x] * pRecPel[x];
+
+                }
+                pOrgPel += iStride1;
+                pRecPel += iStride2;
+            }
+
+            dLocMeanRef /= iWinPixel;
+            dLocMeanRec /= iWinPixel;
+            // xavs2_log(h, XAVS2_LOG_INFO, "dLocMeanRef : %7.4f dLocMeanRec : %7.4f \n",dLocMeanRef,dLocMeanRec);
+
+            dLocVarRef = (dLocVarRef - dLocMeanRef * dLocMeanRef * iWinPixel) / iWinPixel;
+            dLocVarRec = (dLocVarRec - dLocMeanRec * dLocMeanRec * iWinPixel) / iWinPixel;
+            dLocCovar = (dLocCovar - dLocMeanRef * dLocMeanRec * iWinPixel) / iWinPixel;
+
+            Num1 = 2.0 * dLocMeanRef * dLocMeanRec + C1;
+            Num2 = 2.0 * dLocCovar + C2;
+            Den1 = dLocMeanRef * dLocMeanRef + dLocMeanRec * dLocMeanRec + C1;
+            Den2 = dLocVarRef + dLocVarRec + C2;
+
+            dLocSSIM = (Num1 * Num2) / (Den1 * Den2);
+
+            dMSSIM += dLocSSIM;
+        }
+    }
+
+    // xavs2_log(h, XAVS2_LOG_INFO,"ssim: %7.4f \n ", dMSSIM / (double)uiNumWin);
+    return dMSSIM / (double)uiNumWin;
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -405,8 +492,13 @@ void encoder_show_head_info(xavs2_param_t *param)
     xavs2_log(NULL, XAVS2_LOG_DEBUG, " Total Frames     : %d \n", param->num_frames);
     /* basic parameters */
     xavs2_log(NULL, XAVS2_LOG_INFO, "--------------------------------------------------------------------------------\n");
+    if (param->input_sample_bit_depth == 8) {  /* 8-bit input: report sizeof(pel8_t); sizeof yields size_t, so cast to int to match %d */
+    xavs2_log(NULL, XAVS2_LOG_INFO, " Profile & Level  : 0x%02X-0x%02X, BitDepth: %d/%d, size(pel): %d \n",
+              param->profile_id, param->level_id, param->input_sample_bit_depth, param->sample_bit_depth, (int)sizeof(pel8_t));
+    } else {  /* any other input bit depth: report sizeof(pel10_t), cast to int for the same reason */
     xavs2_log(NULL, XAVS2_LOG_INFO, " Profile & Level  : 0x%02X-0x%02X, BitDepth: %d/%d, size(pel): %d \n",
-              param->profile_id, param->level_id, param->input_sample_bit_depth, param->sample_bit_depth, sizeof(pel_t));
+              param->profile_id, param->level_id, param->input_sample_bit_depth, param->sample_bit_depth, (int)sizeof(pel10_t));
+    }
     xavs2_log(NULL, XAVS2_LOG_INFO, " Video Property   : %dx%d, %.3f Hz (FrameRateCode: %d)\n",
               param->org_width, param->org_height, param->frame_rate, param->frame_rate_code);
 
diff --git a/source/encoder/header.c b/source/encoder/header.c
index 0802c4a..7a84ff7 100644
--- a/source/encoder/header.c
+++ b/source/encoder/header.c
@@ -52,7 +52,7 @@
  */
 static ALWAYS_INLINE int is_valid_qp(xavs2_t *h, int i_qp)
 {
-    int max_qp = MAX_QP;
+    int max_qp = MAX_QP + (h->param->sample_bit_depth - 8) * 8;
     UNUSED_PARAMETER(h);
     return i_qp >= 0 && i_qp <= max_qp;
 }
diff --git a/source/encoder/md_inter.c b/source/encoder/md_inter.c
index 8b652f2..7613150 100644
--- a/source/encoder/md_inter.c
+++ b/source/encoder/md_inter.c
@@ -152,7 +152,7 @@ void get_bskip_mv_spatial(cu_mode_t *p_cumode, const neighbor_inter_t *p_neighbo
         }
     }
 
-    /* ÏàÁÚ¿é²»´æÔÚË«ÏòÔ¤²â¿éÊ±£¬Ë«ÏòSkip/DirectÄ£Ê½µÄÌî³ä */
+    /* fill the bidirectional Skip/Direct mode when no neighboring block is bidirectionally predicted */
     if (bid_flag == 0 && fwd_flag != 0 && bw_flag != 0) {
         p_cumode->skip_mv_2nd[DS_B_BID] = p_cumode->skip_mv_2nd[DS_B_BWD];
         p_cumode->skip_mv_1st[DS_B_BID] = p_cumode->skip_mv_1st[DS_B_FWD];
@@ -160,16 +160,16 @@ void get_bskip_mv_spatial(cu_mode_t *p_cumode, const neighbor_inter_t *p_neighbo
     p_cumode->skip_ref_1st[DS_B_BID] = B_FWD;
     p_cumode->skip_ref_2nd[DS_B_BID] = B_BWD;
 
-    /* ÏàÁÚ¿é²»´æÔÚ¶Ô³ÆÔ¤²â¿éÊ±£¬¶Ô³ÆSkip/DirectÄ£Ê½µÄÌî³ä */
+    /* fill the symmetric Skip/Direct mode when no neighboring block is symmetrically predicted */
     if (sym_flag == 0) {
-        if (bid_flag > 1) {  /* Èô´æÔÚË«ÏòÔ¤²â¿é£¬ÔòÊ¹ÓÃË«ÏòÔ¤²â¿éÉú³É */
+        if (bid_flag > 1) {  /* if bidirectional prediction blocks exist, derive from a bidirectional block */
             p_cumode->skip_mv_2nd[DS_B_SYM] = p_neighbors[bid2].mv[1];
             p_cumode->skip_mv_1st[DS_B_SYM] = p_neighbors[bid2].mv[0];
-        } else if (bw_flag != 0) {  /* Èô´æÔÚºóÏòÔ¤²â¿é£¬ÔòÊ¹ÓÃºóÏòÔ¤²â¿éÉú³É */
+        } else if (bw_flag != 0) {  /* if a backward prediction block exists, derive from the backward block */
             p_cumode->skip_mv_2nd[DS_B_SYM]   =  p_cumode->skip_mv_2nd[DS_B_BWD];
             p_cumode->skip_mv_1st[DS_B_SYM].x = -p_cumode->skip_mv_2nd[DS_B_BWD].x;
             p_cumode->skip_mv_1st[DS_B_SYM].y = -p_cumode->skip_mv_2nd[DS_B_BWD].y;
-        } else if (fwd_flag != 0) {  /* Èô´æÔÚÇ°ÏòÔ¤²â¿é£¬ÔòÊ¹ÓÃÇ°ÏòÔ¤²â¿éÉú³É */
+        } else if (fwd_flag != 0) {  /* if a forward prediction block exists, derive from the forward block */
             p_cumode->skip_mv_2nd[DS_B_SYM].x = -p_cumode->skip_mv_1st[DS_B_FWD].x;
             p_cumode->skip_mv_2nd[DS_B_SYM].y = -p_cumode->skip_mv_1st[DS_B_FWD].y;
             p_cumode->skip_mv_1st[DS_B_SYM]   =  p_cumode->skip_mv_1st[DS_B_FWD];
@@ -177,16 +177,16 @@ void get_bskip_mv_spatial(cu_mode_t *p_cumode, const neighbor_inter_t *p_neighbo
     }
     p_cumode->skip_ref_1st[DS_B_SYM] = B_FWD;
     p_cumode->skip_ref_2nd[DS_B_SYM] = B_BWD;
-    /* ºóÏòÔ¤²â¿é²»´æÔÚÊ±ºóÏòSkip/DirectÄ£Ê½µÄÌî³ä */
-    if (bw_flag == 0 && bid_flag > 1) {  /* Èç¹û´æÔÚË«ÏòÔ¤²â¿é£¬ÔòÊ¹ÓÃË«ÏòÔ¤²â¿éÄæÐòµÄ×îºóÒ»¸öÔªËØ */
+    /* fill the backward Skip/Direct mode when no backward prediction block exists */
+    if (bw_flag == 0 && bid_flag > 1) {  /* if bidirectional blocks exist, use the last element of the bidirectional blocks in reverse order */
         p_cumode->skip_mv_2nd[DS_B_BWD] = p_neighbors[bid2].mv[1];
-    } else if (bw_flag == 0 && bid_flag != 0) {  /* Ö»ÓÐÒ»¸öË«ÏòÔ¤²â¿éÊ±£¬Ê¹ÓÃË«ÏòÁÐ±íµÄºóÏò */
+    } else if (bw_flag == 0 && bid_flag != 0) {  /* with only one bidirectional block, use the backward MV of the bidirectional list */
         p_cumode->skip_mv_2nd[DS_B_BWD] = p_cumode->skip_mv_2nd[DS_B_BID];
     }
     p_cumode->skip_ref_1st[DS_B_BWD] = INVALID_REF;
     p_cumode->skip_ref_2nd[DS_B_BWD] = B_BWD;
 
-    /* Ç°ÏòÔ¤²â¿é²»´æÔÚÊ±Ç°ÏòSkip/DirectÄ£Ê½µÄÌî³ä£¬ÀàËÆºóÏòSkip/DirectÄ£Ê½ */
+    /* fill the forward Skip/Direct mode when no forward prediction block exists (analogous to the backward Skip/Direct mode) */
     if (fwd_flag == 0 && bid_flag > 1) {
         p_cumode->skip_mv_1st[DS_B_FWD] = p_neighbors[bid2].mv[0];
     } else if (fwd_flag == 0 && bid_flag != 0) {
@@ -757,8 +757,8 @@ int get_mv_predictors_bskip(xavs2_t *h, cu_t *p_cu)
         col_mv_pos  = (pic_block_y >> 4) * w_in_16x16 + (pic_block_x >> 4);
         col_blk_ref = col_ref[col_mv_pos];
         if (col_blk_ref == INVALID_REF) {
-            ///! 9.5.8.4.3 ÔË¶¯Ê¸Á¿µ¼³ö·½·¨2£ºÈç¹û±àÂë µ¥Ôª×ÓÀàÐÍÎª B_Skip_Bi£¬ÇÒÊ±ÓòPUµÄ²Î¿¼Ë÷ÒýÎª INVALID_REF
-            get_mvp_default(h, p_neighbors, &mv_1st, 0, &cur_cb, B_FWD);  // ÕâÀï´«µÝµÄref_idxÓ°Ïìp_me->pred_sad_space£¬µ«²»±»Ê¹ÓÃ
+            ///! 9.5.8.4.3 motion vector derivation method 2: the coding unit subtype is B_Skip_Bi and the reference index of the temporal PU is INVALID_REF
+            get_mvp_default(h, p_neighbors, &mv_1st, 0, &cur_cb, B_FWD);  // the ref_idx passed here affects p_me->pred_sad_space, which is not used
             get_mvp_default(h, p_neighbors, &mv_2nd, 1, &cur_cb, B_BWD);
         } else {
             int TRp = h->fref[B_BWD]->ref_dpoc[col_blk_ref];
@@ -879,7 +879,7 @@ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_m
     int mv_mempos_x;
     int mv_mempos_y;
     mv_t mv;
-    int b_mv_valid;              // MVÊÇ·ñÓÐÐ§£º´óÐ¡È¡ÖµÊÇ·ñÔÚ±ê×¼¹æ¶¨µÄÓÐÐ§·¶Î§ÄÚ
+    int b_mv_valid;              // whether the MV is valid: its magnitude lies within the range permitted by the standard
     int pu_idx_x = p_cb->x != 0; // PU index in CU
     int pu_idx_y = p_cb->y != 0;
     int pu_idx = (pu_idx_y << 1) + pu_idx_x;
@@ -895,12 +895,16 @@ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_m
     int max_ref = h->i_ref;
 
     *fwd_cost = MAX_DISTORTION;
-    mv_mempos_x = (pix_x + MIN_PU_SIZE - 1) >> MIN_PU_SIZE_IN_BIT;  // ¿¼ÂÇµ½8x8¿éµÄ·Ç¶Ô³Æ»®·Ö£¬ÐèÒª×öÒ»¸ö²¹³¥ÔÙÒÆÎ»
+    mv_mempos_x = (pix_x + MIN_PU_SIZE - 1) >> MIN_PU_SIZE_IN_BIT;  // add an offset before shifting to account for asymmetric partitions of 8x8 blocks
     mv_mempos_y = (pix_y + MIN_PU_SIZE - 1) >> MIN_PU_SIZE_IN_BIT;
     all_min_costs = &h->all_mincost[mv_mempos_y * width_in_4x4 + mv_mempos_x];
 
     /* make p_fenc point to the start address of the current PU */
-    p_me->p_fenc  = h->lcu.p_fenc[0] + (pix_y - h->lcu.i_pix_y) * FENC_STRIDE + pix_x - h->lcu.i_pix_x;
+    if (h->param->input_sample_bit_depth == 8) {
+    p_me->p_fenc8  = h->lcu.p_fenc8[0] + (pix_y - h->lcu.i_pix_y) * FENC_STRIDE + pix_x - h->lcu.i_pix_x;
+    } else {
+    p_me->p_fenc10  = h->lcu.p_fenc10[0] + (pix_y - h->lcu.i_pix_y) * FENC_STRIDE + pix_x - h->lcu.i_pix_x;
+    }
     p_me->i_pixel = PART_INDEX(bsx, bsy);
     p_me->i_pix_x   = pix_x;
     p_me->i_pix_y   = pix_y;
@@ -938,7 +942,7 @@ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_m
             get_mvp_default(h, p_neighbors, pred_mv, bwd_2nd, p_cb, ref_idx);
         }
 
-        // ÐèÔÚ MVP »ñÈ¡Ö®ºóÖ´ÐÐ£¬Á½Õß¶¼»áÉèÖÃ p_me ×´Ì¬
+        // must run after the MVP is obtained; both steps set p_me state
         p_me->i_ref_idx = (int16_t)ref_idx;
         if (h->param->me_method == XAVS2_ME_UMH) {
             fast_me_prepare_info(h, p_me, mode, ref_idx, pu_idx, all_min_costs[0]);
@@ -949,11 +953,11 @@ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_m
         p_me->p_fref_1st = p_ref_frm;
         p_me->mvp.v  = pred_mv->v;
 
-        /* ÏÞÖÆMVPµÄÈ¡Öµ£¬Èç¹ûMVPÖµ¹ý´ó£¬Ôò²»×öME */
+        /* restrict the MVP range: skip ME when the MVP is too large */
         b_mv_valid = check_mv_range(h, pred_mv, ref_idx, pix_x, pix_y, bsx, bsy);
         b_mv_valid &= check_mvd(h, pred_mv->x, pred_mv->y);
 
-        /* Ä¬ÈÏ±ØÐëËÑË÷µÄµãÎ»ÖÃ */
+        /* positions that must always be searched */
         i_mvc = 0;
         i_mvc = add_one_mv_candidate(p_me, mvc, i_mvc, p_me->mvp.x, p_me->mvp.y);
         i_mvc = add_one_mv_candidate(p_me, mvc, i_mvc, 0, 0);
@@ -961,7 +965,7 @@ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_m
         if (b_mv_valid) {
             cost = xavs2_me_search(h, p_me, mvc, i_mvc);
         } else {
-            p_me->bmv = p_me->mvp;  // MVPÔ½½çÊ±£¬×îÓÅMVÉèÖÃ³ÉºÍMVPÒ»Ñù´óÐ¡
+            p_me->bmv = p_me->mvp;  // when the MVP is out of range, set the best MV equal to the MVP
             cost = MAX_DISTORTION;
         }
         mv = p_me->bmv;
@@ -1039,11 +1043,10 @@ void pred_inter_search_bi(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me,
     mv_t mvp, mv;
     cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level);
     cu_mv_mode_t *p_mode_mv = cu_get_layer_mode(h, p_cu->cu_info.i_level)->mvs[mode];
-    pel_t *buf_pixel_temp = p_enc->buf_pixel_temp;
     int pu_size_shift = p_cu->cu_info.i_level - MIN_CU_SIZE_IN_BIT;
     dist_t cost, cost_bid;
     int m, n, i, j;
-    int b_mv_valid;                    // MVÊÇ·ñÓÐÐ§£º´óÐ¡È¡ÖµÊÇ·ñÔÚ±ê×¼¹æ¶¨µÄÓÐÐ§·¶Î§ÄÚ
+    int b_mv_valid;                    // whether the MV is valid: its magnitude lies within the range permitted by the standard
     int pu_idx_x = p_cb->x != 0;       // PU index in CU
     int pu_idx_y = p_cb->y != 0;
     int k = (pu_idx_y << 1) + pu_idx_x;
@@ -1080,8 +1083,10 @@ void pred_inter_search_bi(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me,
     b_mv_valid &= check_mv_range_sym(h,  &mv, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd);
     b_mv_valid &= check_mvd(h, mvp.x, mvp.y);  // avoid mv-bits calculation error
 
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *buf_pixel_temp = p_enc->buf_pixel_temp8;
     if (b_mv_valid) {
-        cost = xavs2_me_search_sym(h, p_me, buf_pixel_temp, &mv);
+        cost = xavs2_me_search_sym8(h, p_me, buf_pixel_temp, &mv);
     } else {
         cost = MAX_DISTORTION;
     }
@@ -1091,7 +1096,7 @@ void pred_inter_search_bi(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me,
     b_mv_valid &= check_mvd(h, p_me->mvp1.x, p_me->mvp1.y);  // avoid mv-bits calculation error
     b_mv_valid &= check_mvd(h, p_me->mvp2.x, p_me->mvp2.y);
     if (b_mv_valid) {
-        cost_bid = xavs2_me_search_bid(h, p_me, buf_pixel_temp, &fwd_mv, &bwd_mv, p_enc);
+        cost_bid = xavs2_me_search_bid8(h, p_me, buf_pixel_temp, &fwd_mv, &bwd_mv, p_enc);
     } else {
         cost_bid = MAX_DISTORTION;
     }
@@ -1127,6 +1132,56 @@ void pred_inter_search_bi(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me,
 
     *sym_mcost = cost;
     *bid_mcost = cost_bid;
+    } else {
+    pel10_t *buf_pixel_temp = p_enc->buf_pixel_temp10;
+    if (b_mv_valid) {
+        cost = xavs2_me_search_sym10(h, p_me, buf_pixel_temp, &mv);
+    } else {
+        cost = MAX_DISTORTION;
+    }
+
+    b_mv_valid  = check_mv_range(h, &fwd_mv, B_FWD, pix_x, pix_y, bsx, bsy);
+    b_mv_valid &= check_mv_range(h, &bwd_mv, B_BWD, pix_x, pix_y, bsx, bsy);
+    b_mv_valid &= check_mvd(h, p_me->mvp1.x, p_me->mvp1.y);  // avoid mv-bits calculation error
+    b_mv_valid &= check_mvd(h, p_me->mvp2.x, p_me->mvp2.y);
+    if (b_mv_valid) {
+        cost_bid = xavs2_me_search_bid10(h, p_me, buf_pixel_temp, &fwd_mv, &bwd_mv, p_enc);
+    } else {
+        cost_bid = MAX_DISTORTION;
+    }
+
+    // store motion vectors
+    m = XAVS2_MAX((bsx >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1);
+    n = XAVS2_MAX((bsy >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1);
+    for (j = 0; j < n; j++) {
+        for (i = 0; i < m; i++) {
+            k = ((pu_idx_y + j) << 1) + (pu_idx_x + i);
+            p_mode_mv[k].all_sym_mv     [0] = mv;
+            p_mode_mv[k].all_dual_mv_1st[0] = fwd_mv;
+            p_mode_mv[k].all_dual_mv_2nd[0] = bwd_mv;
+        }
+    }
+
+    if (!(check_mv_range(h, &fwd_mv, B_FWD, pix_x, pix_y, bsx, bsy) &&
+          check_mvd(h, (fwd_mv.x - p_me->mvp1.x), (fwd_mv.y - p_me->mvp1.y)))) {
+        cost_bid = MAX_DISTORTION;
+    }
+
+    if (!(check_mv_range(h, &bwd_mv, B_BWD, pix_x, pix_y, bsx, bsy) &&
+          check_mvd(h, (bwd_mv.x - p_me->mvp2.x), (bwd_mv.y - p_me->mvp2.y)))) {
+        cost_bid = MAX_DISTORTION;
+    }
+
+    if (!(check_mv_range_sym(h, &mv, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd) &&
+          check_mvd(h, (mv.x - mvp.x), (mv.y - mvp.y)))) {
+        cost = MAX_DISTORTION;
+    }
+    p_me->bmvcost[PDIR_SYM] = p_me->mvcost[PDIR_SYM];
+    p_me->bmvcost[PDIR_BID] = p_me->mvcost[PDIR_BID];
+
+    *sym_mcost = cost;
+    *bid_mcost = cost_bid;
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -1139,7 +1194,6 @@ void pred_inter_search_dual(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me
     mv_t fst_dual, snd_dual;
     cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level);
     cu_mv_mode_t *p_mode_mv = cu_get_layer_mode(h, p_cu->cu_info.i_level)->mvs[mode];
-    pel_t *buf_pixel_temp = p_enc->buf_pixel_temp;
     int pix_x = p_cu->i_pix_x + p_cb->x;
     int pix_y = p_cu->i_pix_y + p_cb->y;
     int pu_idx_x = p_cb->x != 0;           // PU index
@@ -1150,7 +1204,7 @@ void pred_inter_search_dual(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me
     int ref_idx;
     dist_t cost;
     int distance_fwd, distance_bwd;
-    int b_mv_valid;        // MVÊÇ·ñÓÐÐ§£º´óÐ¡È¡ÖµÊÇ·ñÔÚ±ê×¼¹æ¶¨µÄÓÐÐ§·¶Î§ÄÚ
+    int b_mv_valid;        // whether the MV is valid: its magnitude lies within the range permitted by the standard
     int m, n, i, j, k;
     int max_ref = h->i_ref;
 
@@ -1182,8 +1236,10 @@ void pred_inter_search_dual(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me
         b_mv_valid &= check_mvd(h, (fst_dual.x - p_me->mvp1.x), (fst_dual.y - p_me->mvp1.y));
         b_mv_valid &= check_mvd(h, p_me->mvp1.x, p_me->mvp1.y);
         b_mv_valid &= check_mvd(h, p_me->mvp.x, p_me->mvp.y);
+        if (h->param->input_sample_bit_depth == 8) {
+        pel8_t *buf_pixel_temp = p_enc->buf_pixel_temp8;
         if (b_mv_valid) {
-            cost = xavs2_me_search_sym(h, p_me, buf_pixel_temp, &fst_dual);
+            cost = xavs2_me_search_sym8(h, p_me, buf_pixel_temp, &fst_dual);
         } else {
             cost = MAX_DISTORTION;
         }
@@ -1215,6 +1271,42 @@ void pred_inter_search_dual(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me
                 p_me->bmvcost[PDIR_DUAL] = p_me->mvcost[PDIR_SYM];
             }
         }
+        } else {
+        pel10_t *buf_pixel_temp = p_enc->buf_pixel_temp10;
+        if (b_mv_valid) {
+            cost = xavs2_me_search_sym10(h, p_me, buf_pixel_temp, &fst_dual);
+        } else {
+            cost = MAX_DISTORTION;
+        }
+
+        /* store motion vectors and reference frame (for motion vector prediction) */
+        snd_dual.v = MAKEDWORD(scale_mv_skip  (   fst_dual.x, distance_bwd, distance_fwd),
+                               scale_mv_skip_y(h, fst_dual.y, distance_bwd, distance_fwd));
+
+        m = XAVS2_MAX((bsx >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1);
+        n = XAVS2_MAX((bsy >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1);
+        for (j = 0; j < n; j++) {
+            for (i = 0; i < m; i++) {
+                k = ((pu_idx_y + j) << 1) + (pu_idx_x + i);
+                p_mode_mv[k].all_dual_mv_1st[ref_idx] = fst_dual;
+                p_mode_mv[k].all_dual_mv_2nd[ref_idx] = snd_dual;
+            }
+        }
+
+        b_mv_valid &= check_mv_range_sym(h, &fst_dual, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd);
+        b_mv_valid &= check_mvd(h, (fst_dual.x - p_me->mvp1.x), (fst_dual.y - p_me->mvp1.y));
+        if (!b_mv_valid) {
+            cost = MAX_DISTORTION;
+        } else {
+            cost += REF_COST(ref_idx);
+            if (cost < *dual_mcost) {
+                *dual_mcost = cost;
+                *dual_best_fst_ref = ref_idx;
+                *dual_best_snd_ref = !ref_idx;
+                p_me->bmvcost[PDIR_DUAL] = p_me->mvcost[PDIR_SYM];
+            }
+        }
+        }
     }
 }
 
diff --git a/source/encoder/md_intra.c b/source/encoder/md_intra.c
index 08999e5..37753d9 100644
--- a/source/encoder/md_intra.c
+++ b/source/encoder/md_intra.c
@@ -62,14 +62,14 @@ uint32_t get_intra_neighbors(xavs2_t *h, int x_4x4, int y_4x4, int bsx, int bsy,
     const int lcu_mask = (1 << (h->i_lcu_level - 2)) - 1;
     int leftdown, topright;
 
-    /* 1. ¼ì²éÏàÁÚ¿éÊÇ·ñÊôÓÚÍ¬Ò»¸öSlice */
+    /* 1. check whether the neighboring blocks belong to the same slice */
     uint32_t b_LEFT      = is_block_available(h, x_4x4, y_4x4, -1,  0, cur_slice_idx);
     uint32_t b_TOP       = is_block_available(h, x_4x4, y_4x4,  0, -1, cur_slice_idx);
     uint32_t b_TOP_LEFT  = is_block_available(h, x_4x4, y_4x4, -1, -1, cur_slice_idx);
     uint32_t b_TOP_RIGHT = is_block_available(h, x_4x4, y_4x4, (bsx >> 1) - 1, -1, cur_slice_idx);   // (bsx >> MIN_PU_SIZE_IN_BIT << 1)
     uint32_t b_LEFT_DOWN = is_block_available(h, x_4x4, y_4x4, -1, (bsy >> 1) - 1, cur_slice_idx);   // (bsy >> MIN_PU_SIZE_IN_BIT << 1)
 
-    /* 2. ¼ì²éÏàÁÚ¿éÊÇ·ñÔÚµ±Ç°¿éÖ®Ç°ÖØ¹¹ */
+    /* 2. check whether the neighboring blocks are reconstructed before the current block */
     x_4x4   &= lcu_mask;
     y_4x4   &= lcu_mask;
     leftdown = h->tab_avail_DL[((y_4x4 + (bsy >> 2) - 1) << (h->i_lcu_level - B4X4_IN_BIT)) + (x_4x4)];
@@ -101,29 +101,29 @@ uint32_t get_intra_pu_avail(cu_t *p_cu, int block_x, int block_y, int bsx, int b
             avail = (avail & (~(1 << MD_I_LEFT_DOWN))) | (!!IS_NEIGHBOR_AVAIL(cu_avail, MD_I_LEFT) << MD_I_LEFT_DOWN);
         }
     } else if (block_y == 0) {
-        avail = (cu_avail & (1 << MD_I_TOP));  // ÉÏ±ß½çÓÉCUµÄÉÏ±ß½ç¾ö¶¨£»×óÏÂ¾ù²»¿ÉÓÃ
-        avail |= (1 << MD_I_LEFT);             // ×ó±ß½ç¾ù¿ÉÓÃ
-        avail |= ((cu_avail >> MD_I_TOP) & 1) << MD_I_TOP_LEFT;  // ×óÉÏÓÉCUÉÏ±ß½ç¿ÉÓÃÐÔ¾ö¶¨
-        if (block_x + bsx < cu_size) {  // ÓÒÉÏÓÉCUÉÏ±ß½çºÍÓÒÉÏ±ß½ç¾ö¶¨
+        avail = (cu_avail & (1 << MD_I_TOP));  // top availability follows the CU top boundary; left-down is unavailable
+        avail |= (1 << MD_I_LEFT);             // left boundary is always available
+        avail |= ((cu_avail >> MD_I_TOP) & 1) << MD_I_TOP_LEFT;  // top-left follows the availability of the CU top boundary
+        if (block_x + bsx < cu_size) {  // top-right is decided by the CU top and top-right boundaries
             avail |= (!!IS_NEIGHBOR_AVAIL(cu_avail, MD_I_TOP)) << MD_I_TOP_RIGHT;
         } else {
             avail |= cu_avail & (1 << MD_I_TOP_RIGHT);
         }
     } else if (block_x == 0) {
-        avail = (cu_avail & (1 << MD_I_LEFT)); // ×ó±ß½çÓÉCUµÄ×ó±ß½ç¾ö¶¨
-        avail |= (1 << MD_I_TOP);              // ÉÏ±ß½ç¾ù¿ÉÓÃ
-        avail |= ((cu_avail >> MD_I_LEFT) & 1) << MD_I_TOP_LEFT;  // ×óÉÏÓÉCUÉÏ±ß½ç¿ÉÓÃÐÔ¾ö¶¨
-        if (bsx < cu_size && bsy < cu_size) {  // ÓÒÉÏ
+        avail = (cu_avail & (1 << MD_I_LEFT)); // left availability follows the CU left boundary
+        avail |= (1 << MD_I_TOP);              // top boundary is always available
+        avail |= ((cu_avail >> MD_I_LEFT) & 1) << MD_I_TOP_LEFT;  // top-left follows the availability of the CU left boundary
+        if (bsx < cu_size && bsy < cu_size) {  // top-right
             avail |= 1 << MD_I_TOP_RIGHT;
         }
-        // ×óÏÂ
+        // left-down
         if (block_y + bsy < cu_size) {
             avail |= (!!IS_NEIGHBOR_AVAIL(cu_avail, MD_I_LEFT)) << MD_I_LEFT_DOWN;
         } else {
             avail |= cu_avail & (1 << MD_I_LEFT_DOWN);
         }
     } else {
-        // ÓÒÉÏ¡¢×óÏÂ²»¿ÉÓÃ
+        // top-right and left-down are unavailable
         avail = (1 << MD_I_LEFT) | (1 << MD_I_TOP) | (1 << MD_I_TOP_LEFT);
     }
 
@@ -134,20 +134,20 @@ uint32_t get_intra_pu_avail(cu_t *p_cu, int block_x, int block_y, int bsx, int b
  * fill reference samples for luma component
  */
 static INLINE
-void fill_ref_samples_luma(xavs2_t *h, cu_t *p_cu, pel_t *EP,
+void fill_ref_samples_luma8(xavs2_t *h, cu_t *p_cu, pel8_t *EP,
                            int img_x, int img_y,
                            int block_x, int block_y,
                            int bsx, int bsy)
 {
     int pos_x = (img_x - h->lcu.i_pix_x - 1);
     int pos_y = (img_y - h->lcu.i_pix_y - 1);
-    pel_t *pTL = h->lcu.p_fdec[0] + pos_y * FDEC_STRIDE + pos_x;
+    pel8_t *pTL = h->lcu.p_fdec8[0] + pos_y * FDEC_STRIDE + pos_x;
     int xy = (((pos_y + 1) != 0) << 1) + ((pos_x + 1) != 0);
     uint32_t avail;
 
-    /* 1, ¼ì²é²Î¿¼±ß½çÓÐÐ§ÐÔ */
+    /* 1, check the availability of the reference boundaries */
     if (img_x + 2 * bsx <= h->i_width && img_y + 2 * bsy <= h->i_height
-        && 0) {  // TODO: ¸ßµµ´ÎÏÂ²»Æ¥Åä£¬ÈÔ²ÉÓÃÔ­ÏÈÄ¬ÈÏÄ£Ê½
+        && 0) {  // TODO: mismatches at higher presets, so the original default path is still used
         avail = get_intra_pu_avail(p_cu, block_x, block_y, bsx, bsy);
     } else {
         int cur_slice_idx = cu_get_slice_index(h, img_x >> MIN_CU_SIZE_IN_BIT, img_y >> MIN_CU_SIZE_IN_BIT);
@@ -159,8 +159,38 @@ void fill_ref_samples_luma(xavs2_t *h, cu_t *p_cu, pel_t *EP,
 
     p_cu->block_avail = (uint8_t)avail;
 
-    /* 2, Íê³É²Î¿¼±ß½çÏñËØµÄÌî³ä */
-    g_funcs.fill_edge_f[xy](pTL, FDEC_STRIDE, h->lcu.ctu_border[0].rec_top + pos_x - pos_y, EP, avail, bsx, bsy);
+    /* 2, fill the reference boundary samples */
+    g_funcs.fill_edge8_f[xy](h, pTL, FDEC_STRIDE, h->lcu.ctu_border8[0].rec_top + pos_x - pos_y, EP, avail, bsx, bsy);
+}
+
+static INLINE
+void fill_ref_samples_luma10(xavs2_t *h, cu_t *p_cu, pel10_t *EP,
+                           int img_x, int img_y,
+                           int block_x, int block_y,
+                           int bsx, int bsy)
+{
+    int pos_x = (img_x - h->lcu.i_pix_x - 1);
+    int pos_y = (img_y - h->lcu.i_pix_y - 1);
+    pel10_t *pTL = h->lcu.p_fdec10[0] + pos_y * FDEC_STRIDE + pos_x;
+    int xy = (((pos_y + 1) != 0) << 1) + ((pos_x + 1) != 0);
+    uint32_t avail;
+
+    /* 1, check the availability of the reference boundaries */
+    if (img_x + 2 * bsx <= h->i_width && img_y + 2 * bsy <= h->i_height
+        && 0) {  // TODO: mismatches at higher presets, so the original default path is still used
+        avail = get_intra_pu_avail(p_cu, block_x, block_y, bsx, bsy);
+    } else {
+        int cur_slice_idx = cu_get_slice_index(h, img_x >> MIN_CU_SIZE_IN_BIT, img_y >> MIN_CU_SIZE_IN_BIT);
+        int b8_x = img_x >> MIN_PU_SIZE_IN_BIT;
+        int b8_y = img_y >> MIN_PU_SIZE_IN_BIT;
+
+        avail = get_intra_neighbors(h, b8_x, b8_y, bsx, bsy, cur_slice_idx);
+    }
+
+    p_cu->block_avail = (uint8_t)avail;
+
+    /* 2, fill the reference boundary samples */
+    g_funcs.fill_edge10_f[xy](h, pTL, FDEC_STRIDE, h->lcu.ctu_border10[0].rec_top + pos_x - pos_y, EP, avail, bsx, bsy);
 }
 
 /* ---------------------------------------------------------------------------
@@ -169,18 +199,34 @@ void fill_ref_samples_luma(xavs2_t *h, cu_t *p_cu, pel_t *EP,
  * \param dst: aligned to 32-byte
  */
 static INLINE
-void xavs2_intra_prediction(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int i_avail, int bsx, int bsy)
+void xavs2_intra_prediction8(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int i_avail, int bsx, int bsy)
+{
+    //UNUSED_PARAMETER(h);
+
+    if (dir_mode != DC_PRED) {
+        g_funcs.intraf8[dir_mode](h, src, dst, i_dst, dir_mode, bsx, bsy);
+    } else {
+        int b_top  = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_TOP);
+        int b_left = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_LEFT);
+        int mode_ex = ((b_top << 8) + b_left);
+
+        g_funcs.intraf8[dir_mode](h, src, dst, i_dst, mode_ex, bsx, bsy);
+    }
+}
+
+static INLINE
+void xavs2_intra_prediction10(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int i_avail, int bsx, int bsy)
 {
-    UNUSED_PARAMETER(h);
+    //UNUSED_PARAMETER(h);
 
     if (dir_mode != DC_PRED) {
-        g_funcs.intraf[dir_mode](src, dst, i_dst, dir_mode, bsx, bsy);
+        g_funcs.intraf10[dir_mode](h, src, dst, i_dst, dir_mode, bsx, bsy);
     } else {
         int b_top  = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_TOP);
         int b_left = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_LEFT);
         int mode_ex = ((b_top << 8) + b_left);
 
-        g_funcs.intraf[dir_mode](src, dst, i_dst, mode_ex, bsx, bsy);
+        g_funcs.intraf10[dir_mode](h, src, dst, i_dst, mode_ex, bsx, bsy);
     }
 }
 
@@ -212,40 +258,78 @@ void update_candidate_list(int mode, rdcost_t cost, int max_num, intra_candidate
 /* ---------------------------------------------------------------------------
  * used for generating intra luma prediction samples
  */
-#define PREDICT_ADD_LUMA(MODE_IDX) \
+
+#define PREDICT_ADD_LUMA8(MODE_IDX) \
 {\
-    pel_t *p_pred = p_enc->intra_pred[MODE_IDX];\
+    pel8_t *p_pred = p_enc->intra8_pred[MODE_IDX];\
     int mode_bits = (mpm[0] == (MODE_IDX) || mpm[1] == (MODE_IDX)) ? 2 : 6;\
     rdcost_t cost = h->f_lambda_mode * mode_bits; \
     \
-    xavs2_intra_prediction(h, edge_pixels, p_pred, block_w, MODE_IDX,\
+    xavs2_intra_prediction8(h, edge_pixels, p_pred, block_w, MODE_IDX,\
         p_cu->block_avail, block_w, block_h);\
-    cost += intra_cmp(p_fenc, FENC_STRIDE, p_pred, block_w);\
+    cost += intra8_cmp(p_fenc, FENC_STRIDE, p_pred, block_w);\
+    update_candidate_list(MODE_IDX, cost, INTRA_MODE_NUM_FOR_RDO, p_candidates);\
+}
+
+#define PREDICT_ADD_LUMA10(MODE_IDX) \
+{\
+    pel10_t *p_pred = p_enc->intra10_pred[MODE_IDX];\
+    int mode_bits = (mpm[0] == (MODE_IDX) || mpm[1] == (MODE_IDX)) ? 2 : 6;\
+    rdcost_t cost = h->f_lambda_mode * mode_bits; \
+    \
+    xavs2_intra_prediction10(h, edge_pixels, p_pred, block_w, MODE_IDX,\
+        p_cu->block_avail, block_w, block_h);\
+    cost += intra10_cmp(p_fenc, FENC_STRIDE, p_pred, block_w);\
     update_candidate_list(MODE_IDX, cost, INTRA_MODE_NUM_FOR_RDO, p_candidates);\
 }
 
 /* ---------------------------------------------------------------------------
  * return numbers for RDO and candidate list by scanning all the intra modes
  */
-int rdo_get_pred_intra_luma(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
-                            pel_t *p_fenc, int mpm[], int blockidx,
+int rdo_get_pred_intra_luma8(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                            pel8_t *p_fenc, int mpm[], int blockidx,
+                            int block_x, int block_y, int block_w, int block_h)
+{
+    pixel8_cmp_t intra8_cmp = g_funcs.pixf.intra8_cmp[PART_INDEX(block_w, block_h)];
+    cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level);
+    pel8_t *edge_pixels   = &p_enc->buf_edge_pixels8[(MAX_CU_SIZE << 2) - 1];
+    int mode;
+    int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x;
+    int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y;
+
+    /* get edge samples for intra prediction */
+    fill_ref_samples_luma8(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h);
+
+    UNUSED_PARAMETER(blockidx);
+
+    /* loop over all intra predication modes */
+    for (mode = 0; mode < NUM_INTRA_MODE; mode++) {
+        PREDICT_ADD_LUMA8(mode);
+    }
+
+    p_cu->feature.intra_had_cost = p_candidates[0].cost;
+    return h->tab_num_intra_rdo[p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON)];
+}
+
+int rdo_get_pred_intra_luma10(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                            pel10_t *p_fenc, int mpm[], int blockidx,
                             int block_x, int block_y, int block_w, int block_h)
 {
-    pixel_cmp_t intra_cmp = g_funcs.pixf.intra_cmp[PART_INDEX(block_w, block_h)];
+    pixel10_cmp_t intra10_cmp = g_funcs.pixf.intra10_cmp[PART_INDEX(block_w, block_h)];
     cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level);
-    pel_t *edge_pixels   = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 2) - 1];
+    pel10_t *edge_pixels   = &p_enc->buf_edge_pixels10[(MAX_CU_SIZE << 2) - 1];
     int mode;
     int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x;
     int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y;
 
     /* get edge samples for intra prediction */
-    fill_ref_samples_luma(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h);
+    fill_ref_samples_luma10(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h);
 
     UNUSED_PARAMETER(blockidx);
 
     /* loop over all intra predication modes */
     for (mode = 0; mode < NUM_INTRA_MODE; mode++) {
-        PREDICT_ADD_LUMA(mode);
+        PREDICT_ADD_LUMA10(mode);
     }
 
     p_cu->feature.intra_had_cost = p_candidates[0].cost;
@@ -255,16 +339,16 @@ int rdo_get_pred_intra_luma(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candida
 /* ---------------------------------------------------------------------------
  * return numbers for RDO and candidate list by rough scanning
  */
-int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
-                                pel_t *p_fenc, int mpm[], int blockidx,
+int rdo_get_pred_intra_luma8_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                pel8_t *p_fenc, int mpm[], int blockidx,
                                 int block_x, int block_y, int block_w, int block_h)
 {
     int visited[NUM_INTRA_MODE] = { 0 };    /* 0: not visited yet
                                              * 1: visited in the first phase
                                              * 2: visited in final_mode */
-    pixel_cmp_t intra_cmp = g_funcs.pixf.intra_cmp[PART_INDEX(block_w, block_h)];
+    pixel8_cmp_t intra8_cmp = g_funcs.pixf.intra8_cmp[PART_INDEX(block_w, block_h)];
     cu_parallel_t *p_enc  = cu_get_enc_context(h, p_cu->cu_info.i_level);
-    pel_t *edge_pixels    = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 2) - 1];
+    pel8_t *edge_pixels    = &p_enc->buf_edge_pixels8[(MAX_CU_SIZE << 2) - 1];
     int mode, i, j;
     int num_angle = 0;
     int num_for_rdo;
@@ -273,23 +357,23 @@ int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_can
     int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y;
 
     /* get edge samples for intra prediction */
-    fill_ref_samples_luma(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h);
+    fill_ref_samples_luma8(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h);
 
     UNUSED_PARAMETER(blockidx);
 
-    /* 1, ±éÀú»ù´¡Ä£Ê½£¬
-     * (1.1) ¼¸¸ö¹Ø¼üµÄ½Ç¶È */
+    /* 1, scan the base modes,
+     * (1.1) a few key angles */
     for (mode = 0; mode < 3; mode++) {
-        PREDICT_ADD_LUMA(mode);
+        PREDICT_ADD_LUMA8(mode);
         visited[mode] = 1;
     }
-    /* (1.2) ½Ç¶ÈÔ¤²âÄ£Ê½ */
+    /* (1.2) angular prediction modes */
     for (mode = 4; mode < NUM_INTRA_MODE; mode += 4) {
-        PREDICT_ADD_LUMA(mode);
+        PREDICT_ADD_LUMA8(mode);
         visited[mode] = 1;
     }
 
-    /* 2, ±éÀúN¸ö×îÓÅµÄÄ£Ê½µÄ¾àÀëÎª¶þµÄÄ£Ê½£¬Èç¹û½ÏÓÅÔò·Åµ½CandModeListÖÐ */
+    /* 2, try the modes at distance two from the N best modes and put the better ones into CandModeList */
     num_to_add = h->num_intra_rmd_dist2;
     for (i = 0; i < num_to_add; i++) {
         mode = p_candidates[i].mode;
@@ -299,18 +383,18 @@ int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_can
 
         if (mode > 3 && !visited[mode - 2]) {
             j = mode - 2;
-            PREDICT_ADD_LUMA(j);
+        PREDICT_ADD_LUMA8(j);
             visited[j] = 1;
         }
 
         if (mode < NUM_INTRA_MODE - 2 && !visited[mode + 2]) {
             j = mode + 2;
-            PREDICT_ADD_LUMA(j);
+        PREDICT_ADD_LUMA8(j);
             visited[j] = 1;
         }
     }
 
-    /* 3, °ÑÒÔÉÏµÃµ½µÄ×î¼ÑµÄÁ½¸öÄ£Ê½µÄ¾àÀëÎªÒ»µÄÄ£Ê½·ÅÔÚCandModeListÖÐ */
+    /* 3, put the modes at distance one from the best two modes found above into CandModeList */
     num_to_add = h->num_intra_rmd_dist1;
     for (i = 0, num_angle = 0; num_angle < num_to_add && i < INTRA_MODE_NUM_FOR_RDO; i++) {
         mode = p_candidates[i].mode;
@@ -320,42 +404,179 @@ int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_can
 
         if (mode > 3 && !visited[mode - 1]) {
             j = mode - 1;
-            PREDICT_ADD_LUMA(j);
+        PREDICT_ADD_LUMA8(j);
             visited[j] = 1;
             num_angle++;
         }
 
         if (mode < NUM_INTRA_MODE - 1 && !visited[mode + 1]) {
             j = mode + 1;
-            PREDICT_ADD_LUMA(j);
+        PREDICT_ADD_LUMA8(j);
             visited[j] = 1;
             num_angle++;
         }
     }
 
-    /* 4, ²éÕÒ×îÓÅÁÐ±íÖÐÊÇ·ñÓÐMPMs£¬ÈôÃ»ÓÐ£¬Ôò¼ÓÈë£¬ÈôÓÐÔò²»ÓÃ¼ÓÈë */
+    /* 4, check whether the MPMs are already in the best list; add them if not, otherwise nothing to add */
     if (!visited[mpm[0]]) {
         mode = mpm[0];
-        PREDICT_ADD_LUMA(mode);
+        PREDICT_ADD_LUMA8(mode);
         visited[mode] = 1;
     }
 
     if (!visited[mpm[1]]) {
         mode = mpm[1];
-        PREDICT_ADD_LUMA(mode);
+        PREDICT_ADD_LUMA8(mode);
         visited[mode] = 1;
     }
 
     num_for_rdo = h->tab_num_intra_rdo[p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON)];
 
-    /* Èôµ±Ç°¾Ö²¿×îÓÅµÄÁ½¸öÄ£Ê½ÊÇMPMÖ®Ò»£¬Ôò¼õÉÙRDOÄ£Ê½ÊýÁ¿ */
+    /* if either of the two current locally-best modes is one of the MPMs, reduce the number of RDO modes */
     if (p_candidates[0].mode == mpm[0] || p_candidates[0].mode == mpm[1] ||
         p_candidates[1].mode == mpm[0] || p_candidates[1].mode == mpm[1]) {
         num_for_rdo = XAVS2_MIN(num_for_rdo, 3);
         return num_for_rdo;
     }
 
-    /* ´ÓM¸ö×îÓÅÄ£Ê½ÖÐÑ¡¶¨×îÖÕ²Î¼ÓRDOµÄÄ£Ê½£¬¼´È¥ÖØ */
+    /* choose the final RDO modes from the M best modes, i.e. remove duplicates */
+    visited[p_candidates[0].mode] = 2;
+    visited[p_candidates[1].mode] = 2;
+
+    for (i = 2, j = 2; i < INTRA_MODE_NUM_FOR_RDO && j < num_for_rdo; i++) {
+        mode = p_candidates[i].mode;
+        if (!visited[mode]) {
+            continue;
+        }
+        if (mode <= 2) {
+            p_candidates[j++].mode = mode;
+            visited[mode] = 2;
+        } else if (mode == 3) {
+            if (visited[4] == 1) {
+                p_candidates[j++].mode = 3;
+                visited[3] = 2;
+            }
+        } else if (mode == 32) {
+            if (visited[31] == 1) {
+                p_candidates[j++].mode = 32;
+                visited[32] = 2;
+            }
+        } else {
+            if (visited[mode - 1] == 1 && visited[mode + 1] == 1) {
+                p_candidates[j++].mode = mode;
+                visited[mode] = 2;
+            }
+        }
+        if (visited[0] == 2 && visited[1] == 2 && visited[2] == 2) {
+            break;
+        }
+    }
+
+    p_cu->feature.intra_had_cost = p_candidates[0].cost;
+    return XAVS2_MIN(num_for_rdo, j);
+}
+
+int rdo_get_pred_intra_luma10_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                pel10_t *p_fenc, int mpm[], int blockidx,
+                                int block_x, int block_y, int block_w, int block_h)
+{
+    int visited[NUM_INTRA_MODE] = { 0 };    /* 0: not visited yet
+                                             * 1: visited in the first phase
+                                             * 2: visited in final_mode */
+    pixel10_cmp_t intra10_cmp = g_funcs.pixf.intra10_cmp[PART_INDEX(block_w, block_h)];
+    cu_parallel_t *p_enc  = cu_get_enc_context(h, p_cu->cu_info.i_level);
+    pel10_t *edge_pixels    = &p_enc->buf_edge_pixels10[(MAX_CU_SIZE << 2) - 1];
+    int mode, i, j;
+    int num_angle = 0;
+    int num_for_rdo;
+    int num_to_add;
+    int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x;
+    int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y;
+
+    /* get edge samples for intra prediction */
+    fill_ref_samples_luma10(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h);
+
+    UNUSED_PARAMETER(blockidx);
+
+    /* 1, traverse the basic modes:
+     * (1.1) a few key angles */
+    for (mode = 0; mode < 3; mode++) {
+        PREDICT_ADD_LUMA10(mode);
+        visited[mode] = 1;
+    }
+    /* (1.2) angular prediction modes */
+    for (mode = 4; mode < NUM_INTRA_MODE; mode += 4) {
+        PREDICT_ADD_LUMA10(mode);
+        visited[mode] = 1;
+    }
+
+    /* 2, for the N best modes, try the modes at distance two from them; those that are better go into CandModeList */
+    num_to_add = h->num_intra_rmd_dist2;
+    for (i = 0; i < num_to_add; i++) {
+        mode = p_candidates[i].mode;
+        if (mode <= 2) {
+            continue;
+        }
+
+        if (mode > 3 && !visited[mode - 2]) {
+            j = mode - 2;
+        PREDICT_ADD_LUMA10(j);
+            visited[j] = 1;
+        }
+
+        if (mode < NUM_INTRA_MODE - 2 && !visited[mode + 2]) {
+            j = mode + 2;
+        PREDICT_ADD_LUMA10(j);
+            visited[j] = 1;
+        }
+    }
+
+    /* 3, put the modes at distance one from the best two modes found above into CandModeList */
+    num_to_add = h->num_intra_rmd_dist1;
+    for (i = 0, num_angle = 0; num_angle < num_to_add && i < INTRA_MODE_NUM_FOR_RDO; i++) {
+        mode = p_candidates[i].mode;
+        if (mode <= 2) {
+            continue;
+        }
+
+        if (mode > 3 && !visited[mode - 1]) {
+            j = mode - 1;
+        PREDICT_ADD_LUMA10(j);
+            visited[j] = 1;
+            num_angle++;
+        }
+
+        if (mode < NUM_INTRA_MODE - 1 && !visited[mode + 1]) {
+            j = mode + 1;
+        PREDICT_ADD_LUMA10(j);
+            visited[j] = 1;
+            num_angle++;
+        }
+    }
+
+    /* 4, check whether the MPMs are already in the best list; add them if not, otherwise nothing to add */
+    if (!visited[mpm[0]]) {
+        mode = mpm[0];
+        PREDICT_ADD_LUMA10(mode);
+        visited[mode] = 1;
+    }
+
+    if (!visited[mpm[1]]) {
+        mode = mpm[1];
+        PREDICT_ADD_LUMA10(mode);
+        visited[mode] = 1;
+    }
+
+    num_for_rdo = h->tab_num_intra_rdo[p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON)];
+
+    /* if either of the two current locally-best modes is one of the MPMs, reduce the number of RDO modes */
+    if (p_candidates[0].mode == mpm[0] || p_candidates[0].mode == mpm[1] ||
+        p_candidates[1].mode == mpm[0] || p_candidates[1].mode == mpm[1]) {
+        num_for_rdo = XAVS2_MIN(num_for_rdo, 3);
+        return num_for_rdo;
+    }
+
+    /* choose the final RDO modes from the M best modes, i.e. remove duplicates */
     visited[p_candidates[0].mode] = 2;
     visited[p_candidates[1].mode] = 2;
 
@@ -396,24 +617,48 @@ int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_can
 /* ---------------------------------------------------------------------------
  * return the best intra prediction mode from the 1st run
  */
-int rdo_get_pred_intra_luma_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
-                                     pel_t *p_fenc, int mpm[], int blockidx,
+int rdo_get_pred_intra_luma8_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                     pel8_t *p_fenc, int mpm[], int blockidx,
                                      int block_x, int block_y, int block_w, int block_h)
 {
     cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level);
     int best_intra_mode = p_cu->cu_info.real_intra_modes[blockidx];
-    pel_t *edge_pixels = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 2) - 1];
-    pel_t *p_pred = p_enc->intra_pred[best_intra_mode];
+    pel8_t *edge_pixels = &p_enc->buf_edge_pixels8[(MAX_CU_SIZE << 2) - 1];
+    pel8_t *p_pred = p_enc->intra8_pred[best_intra_mode];
     int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x;
     int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y;
 
     /* get edge samples for intra prediction */
-    fill_ref_samples_luma(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h);
+    fill_ref_samples_luma8(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h);
 
     UNUSED_PARAMETER(p_fenc);
     UNUSED_PARAMETER(mpm);
 
-    xavs2_intra_prediction(h, edge_pixels, p_pred, block_w, best_intra_mode, p_cu->block_avail, block_w, block_h);
+    xavs2_intra_prediction8(h, edge_pixels, p_pred, block_w, best_intra_mode, p_cu->block_avail, block_w, block_h);
+    p_candidates[0].mode = best_intra_mode;
+    p_candidates[0].cost = 0;
+
+    return 1;
+}
+
+int rdo_get_pred_intra_luma10_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates,
+                                     pel10_t *p_fenc, int mpm[], int blockidx,
+                                     int block_x, int block_y, int block_w, int block_h)
+{
+    cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level);
+    int best_intra_mode = p_cu->cu_info.real_intra_modes[blockidx];
+    pel10_t *edge_pixels = &p_enc->buf_edge_pixels10[(MAX_CU_SIZE << 2) - 1];
+    pel10_t *p_pred = p_enc->intra10_pred[best_intra_mode];
+    int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x;
+    int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y;
+
+    /* get edge samples for intra prediction */
+    fill_ref_samples_luma10(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h);
+
+    UNUSED_PARAMETER(p_fenc);
+    UNUSED_PARAMETER(mpm);
+
+    xavs2_intra_prediction10(h, edge_pixels, p_pred, block_w, best_intra_mode, p_cu->block_avail, block_w, block_h);
     p_candidates[0].mode = best_intra_mode;
     p_candidates[0].cost = 0;
 
@@ -430,46 +675,113 @@ int rdo_get_pred_intra_chroma_fast(xavs2_t *h, cu_t *p_cu, int i_level, int pix_
                                    intra_candidate_t *p_candidate_list)
 {
     cu_parallel_t *p_enc = cu_get_enc_context(h, i_level + 1);
-    pel_t *p_fenc_u = h->lcu.p_fenc[IMG_U] + pix_y_c * FENC_STRIDE + pix_x_c;
-    pel_t *p_fenc_v = h->lcu.p_fenc[IMG_V] + pix_y_c * FENC_STRIDE + pix_x_c;
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_fenc_u = h->lcu.p_fenc8[IMG_U] + pix_y_c * FENC_STRIDE + pix_x_c;
+    pel8_t *p_fenc_v = h->lcu.p_fenc8[IMG_V] + pix_y_c * FENC_STRIDE + pix_x_c;
+    int blksize = 1 << i_level;
+    pixel8_cmp_t intra_chroma_cost = g_funcs.pixf.intra8_cmp[PART_INDEX(blksize, blksize)];
+    int num_for_rdo = 0;
+
+    int LUMA_MODE[5] = { -1, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; // map chroma mode to luma mode
+    pel8_t *EP_u = &p_enc->buf_edge_pixels8[(MAX_CU_SIZE << 1) - 1];
+    pel8_t *EP_v = EP_u + (MAX_CU_SIZE << 2);
+    int xy = p_cu->in_lcu_edge;
+
+    /* compute the position of the top-left pixel of the U and V components */
+    pel8_t *pTL_u = h->lcu.p_fdec8[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
+    pel8_t *pTL_v = h->lcu.p_fdec8[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
+    int offset = (FREC_CSTRIDE >> 1);
+    int m;
+
+    /* check boundary availability */
+    uint32_t avail = p_cu->intra_avail;
+
+    /* compute the prediction mode corresponding to each mode index */
+    LUMA_MODE[0] = p_cu->cu_info.real_intra_modes[0];
+
+    /* 2.1, fetch the reference boundary pixels */
+    g_funcs.fill_edge8_f[xy](h, pTL_u, FDEC_STRIDE, h->lcu.ctu_border8[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, blksize, blksize);
+    g_funcs.fill_edge8_f[xy](h, pTL_v, FDEC_STRIDE, h->lcu.ctu_border8[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, blksize, blksize);
+
+    for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) {
+        p_candidate_list[m].mode = DM_PRED_C;
+        p_candidate_list[m].cost = MAX_COST;
+    }
+
+    /* 2.2, perform prediction */
+    for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) {
+        pel8_t *p_pred_u = p_enc->intra8_pred_c[m];
+        pel8_t *p_pred_v = p_enc->intra8_pred_c[m] + offset;
+        rdcost_t est_cost;
+
+        xavs2_intra_prediction8(h, EP_u, p_pred_u, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize);
+        xavs2_intra_prediction8(h, EP_v, p_pred_v, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize);
+
+        est_cost  = intra_chroma_cost(p_fenc_u, FENC_STRIDE, p_pred_u, FREC_CSTRIDE);
+        est_cost += intra_chroma_cost(p_fenc_v, FENC_STRIDE, p_pred_v, FREC_CSTRIDE);
+
+        update_candidate_list(m, est_cost, NUM_INTRA_MODE_CHROMA, p_candidate_list);
+    }
+
+    if (h->i_type != SLICE_TYPE_I) {
+        num_for_rdo = NUM_INTRA_C_FULL_RD;
+        if (i_level == 6) {
+            num_for_rdo -= 2;
+        } else if (i_level == 5) {
+            num_for_rdo -= 1;
+        }
+    } else {
+        num_for_rdo = NUM_INTRA_MODE_CHROMA;
+    }
+
+    if (p_candidate_list[0].mode == DM_PRED_C) {
+        num_for_rdo = 1;
+    }
+
+    num_for_rdo = XAVS2_MIN(h->num_rdo_intra_chroma, num_for_rdo);
+
+    return num_for_rdo;
+    } else {
+    pel10_t *p_fenc_u = h->lcu.p_fenc10[IMG_U] + pix_y_c * FENC_STRIDE + pix_x_c;
+    pel10_t *p_fenc_v = h->lcu.p_fenc10[IMG_V] + pix_y_c * FENC_STRIDE + pix_x_c;
     int blksize = 1 << i_level;
-    pixel_cmp_t intra_chroma_cost = g_funcs.pixf.intra_cmp[PART_INDEX(blksize, blksize)];
+    pixel10_cmp_t intra_chroma_cost = g_funcs.pixf.intra10_cmp[PART_INDEX(blksize, blksize)];
     int num_for_rdo = 0;
 
     int LUMA_MODE[5] = { -1, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; // map chroma mode to luma mode
-    pel_t *EP_u = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 1) - 1];
-    pel_t *EP_v = EP_u + (MAX_CU_SIZE << 2);
+    pel10_t *EP_u = &p_enc->buf_edge_pixels10[(MAX_CU_SIZE << 1) - 1];
+    pel10_t *EP_v = EP_u + (MAX_CU_SIZE << 2);
     int xy = p_cu->in_lcu_edge;
 
-    /* ¼ÆËãU¡¢V·ÖÁ¿µÄ×óÉÏ½ÇÏñËØµãµÄÎ»ÖÃ */
-    pel_t *pTL_u = h->lcu.p_fdec[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
-    pel_t *pTL_v = h->lcu.p_fdec[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
+    /* compute the position of the top-left pixel of the U and V components */
+    pel10_t *pTL_u = h->lcu.p_fdec10[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
+    pel10_t *pTL_v = h->lcu.p_fdec10[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
     int offset = (FREC_CSTRIDE >> 1);
     int m;
 
-    /* ¼ì²é±ß½çÓÐÐ§ÐÔ */
+    /* check boundary availability */
     uint32_t avail = p_cu->intra_avail;
 
-    /* ¼ÆËãÃ¿¸öÄ£Ê½ºÅ¶ÔÓ¦µÄÔ¤²âÄ£Ê½ */
+    /* compute the prediction mode corresponding to each mode index */
     LUMA_MODE[0] = p_cu->cu_info.real_intra_modes[0];
 
-    /* 2.1, »ñÈ¡²Î¿¼±ß½çÏñËØ */
-    g_funcs.fill_edge_f[xy](pTL_u, FDEC_STRIDE, h->lcu.ctu_border[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, blksize, blksize);
-    g_funcs.fill_edge_f[xy](pTL_v, FDEC_STRIDE, h->lcu.ctu_border[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, blksize, blksize);
+    /* 2.1, fetch the reference boundary pixels */
+    g_funcs.fill_edge10_f[xy](h, pTL_u, FDEC_STRIDE, h->lcu.ctu_border10[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, blksize, blksize);
+    g_funcs.fill_edge10_f[xy](h, pTL_v, FDEC_STRIDE, h->lcu.ctu_border10[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, blksize, blksize);
 
     for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) {
         p_candidate_list[m].mode = DM_PRED_C;
         p_candidate_list[m].cost = MAX_COST;
     }
 
-    /* 2.2, Ö´ÐÐÔ¤²â */
+    /* 2.2, perform prediction */
     for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) {
-        pel_t *p_pred_u = p_enc->intra_pred_c[m];
-        pel_t *p_pred_v = p_enc->intra_pred_c[m] + offset;
+        pel10_t *p_pred_u = p_enc->intra10_pred_c[m];
+        pel10_t *p_pred_v = p_enc->intra10_pred_c[m] + offset;
         rdcost_t est_cost;
 
-        xavs2_intra_prediction(h, EP_u, p_pred_u, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize);
-        xavs2_intra_prediction(h, EP_v, p_pred_v, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize);
+        xavs2_intra_prediction10(h, EP_u, p_pred_u, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize);
+        xavs2_intra_prediction10(h, EP_v, p_pred_v, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize);
 
         est_cost  = intra_chroma_cost(p_fenc_u, FENC_STRIDE, p_pred_u, FREC_CSTRIDE);
         est_cost += intra_chroma_cost(p_fenc_v, FENC_STRIDE, p_pred_v, FREC_CSTRIDE);
@@ -495,6 +807,7 @@ int rdo_get_pred_intra_chroma_fast(xavs2_t *h, cu_t *p_cu, int i_level, int pix_
     num_for_rdo = XAVS2_MIN(h->num_rdo_intra_chroma, num_for_rdo);
 
     return num_for_rdo;
+    }
 }
 //#endif
 
@@ -504,39 +817,75 @@ int rdo_get_pred_intra_chroma_fast(xavs2_t *h, cu_t *p_cu, int i_level, int pix_
 int rdo_get_pred_intra_chroma(xavs2_t *h, cu_t *p_cu, int i_level_c, int pix_y_c, int pix_x_c,
                               intra_candidate_t *p_candidate_list)
 {
+    if (h->param->input_sample_bit_depth == 8) {
     int LUMA_MODE[5] = { -1, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; // map chroma mode to luma mode
     cu_parallel_t *p_enc = cu_get_enc_context(h, i_level_c + 1);
-    pel_t *EP_u = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 1) - 1];
-    pel_t *EP_v = EP_u + (MAX_CU_SIZE << 2);
+    pel8_t *EP_u = &p_enc->buf_edge_pixels8[(MAX_CU_SIZE << 1) - 1];
+    pel8_t *EP_v = EP_u + (MAX_CU_SIZE << 2);
     int bsize   = 1 << i_level_c;
     int xy = p_cu->in_lcu_edge;
 
-    /* ¼ÆËãU¡¢V·ÖÁ¿µÄ×óÉÏ½ÇÏñËØµãµÄÎ»ÖÃ */
-    pel_t *pTL_u = h->lcu.p_fdec[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
-    pel_t *pTL_v = h->lcu.p_fdec[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
+    /* compute the position of the top-left pixel of the U and V components */
+    pel8_t *pTL_u = h->lcu.p_fdec8[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
+    pel8_t *pTL_v = h->lcu.p_fdec8[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
     int offset = (FREC_CSTRIDE >> 1);
     int m;
 
-    /* ¼ì²é±ß½çÓÐÐ§ÐÔ */
+    /* check boundary availability */
     uint32_t avail = p_cu->intra_avail;
 
-    /* ¼ÆËãÃ¿¸öÄ£Ê½ºÅ¶ÔÓ¦µÄÔ¤²âÄ£Ê½ */
+    /* compute the prediction mode corresponding to each mode index */
     LUMA_MODE[0] = p_cu->cu_info.real_intra_modes[0];
 
-    /* 2.1, »ñÈ¡²Î¿¼±ß½çÏñËØ */
-    g_funcs.fill_edge_f[xy](pTL_u, FDEC_STRIDE, h->lcu.ctu_border[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, bsize, bsize);
-    g_funcs.fill_edge_f[xy](pTL_v, FDEC_STRIDE, h->lcu.ctu_border[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, bsize, bsize);
+    /* 2.1, fetch the reference boundary pixels */
+    g_funcs.fill_edge8_f[xy](h, pTL_u, FDEC_STRIDE, h->lcu.ctu_border8[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, bsize, bsize);
+    g_funcs.fill_edge8_f[xy](h, pTL_v, FDEC_STRIDE, h->lcu.ctu_border8[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, bsize, bsize);
 
-    /* 2.2, Ö´ÐÐÔ¤²â */
+    /* 2.2, perform prediction */
     for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) {
-        xavs2_intra_prediction(h, EP_u, p_enc->intra_pred_c[m] + 0,      FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize);
-        xavs2_intra_prediction(h, EP_v, p_enc->intra_pred_c[m] + offset, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize);
+        xavs2_intra_prediction8(h, EP_u, p_enc->intra8_pred_c[m] + 0,      FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize);
+        xavs2_intra_prediction8(h, EP_v, p_enc->intra8_pred_c[m] + offset, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize);
 
         p_candidate_list[m].mode = m;
         p_candidate_list[m].cost = MAX_COST;
     }
 
     return NUM_INTRA_MODE_CHROMA;
+    } else {
+    int LUMA_MODE[5] = { -1, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; // map chroma mode to luma mode
+    cu_parallel_t *p_enc = cu_get_enc_context(h, i_level_c + 1);
+    pel10_t *EP_u = &p_enc->buf_edge_pixels10[(MAX_CU_SIZE << 1) - 1];
+    pel10_t *EP_v = EP_u + (MAX_CU_SIZE << 2);
+    int bsize   = 1 << i_level_c;
+    int xy = p_cu->in_lcu_edge;
+
+    /* compute the position of the top-left pixel of the U and V components */
+    pel10_t *pTL_u = h->lcu.p_fdec10[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
+    pel10_t *pTL_v = h->lcu.p_fdec10[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1;
+    int offset = (FREC_CSTRIDE >> 1);
+    int m;
+
+    /* check boundary availability */
+    uint32_t avail = p_cu->intra_avail;
+
+    /* compute the prediction mode corresponding to each mode index */
+    LUMA_MODE[0] = p_cu->cu_info.real_intra_modes[0];
+
+    /* 2.1, fetch the reference boundary pixels */
+    g_funcs.fill_edge10_f[xy](h, pTL_u, FDEC_STRIDE, h->lcu.ctu_border10[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, bsize, bsize);
+    g_funcs.fill_edge10_f[xy](h, pTL_v, FDEC_STRIDE, h->lcu.ctu_border10[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, bsize, bsize);
+
+    /* 2.2, perform prediction */
+    for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) {
+        xavs2_intra_prediction10(h, EP_u, p_enc->intra10_pred_c[m] + 0,      FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize);
+        xavs2_intra_prediction10(h, EP_v, p_enc->intra10_pred_c[m] + offset, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize);
+
+        p_candidate_list[m].mode = m;
+        p_candidate_list[m].cost = MAX_COST;
+    }
+
+    return NUM_INTRA_MODE_CHROMA;
+    }
 }
 
 /* ---------------------------------------------------------------------------
diff --git a/source/encoder/me.c b/source/encoder/me.c
index db3dea7..f79ff86 100644
--- a/source/encoder/me.c
+++ b/source/encoder/me.c
@@ -125,7 +125,7 @@ static int8_t GRID[24][2] = {
 };
 
 /* ---------------------------------------------------------------------------
- * ÓÃÓÚ·ÖÏñËØËÑË÷µÄÕý·½ÐÎËÑË÷ */
+ * square search pattern used for sub-pixel search */
 static const int8_t Spiral[9][2] = {
     {  0,  0 }, {  0, -1 }, {  0, 1 },
     { -1, -1 }, {  1, -1 }, { -1, 0 },
@@ -162,11 +162,18 @@ static const int i_org = FENC_STRIDE;
 
 /* ---------------------------------------------------------------------------
  * early termination */
-#define EARLY_TERMINATION(pred_sad) \
+#define EARLY_TERMINATION8(pred_sad) \
     if (bcost < (pred_sad) * beta3) {\
-        goto umh_step_3;\
+        goto umh_step8_3;\
     } else if (bcost < (pred_sad) * beta2) {\
-        goto umh_step_2;\
+        goto umh_step8_2;\
+    }
+
+#define EARLY_TERMINATION10(pred_sad) \
+    if (bcost < (pred_sad) * beta3) {\
+        goto umh_step10_3;\
+    } else if (bcost < (pred_sad) * beta2) {\
+        goto umh_step10_2;\
     }
 
 
@@ -178,34 +185,67 @@ static const int i_org = FENC_STRIDE;
 
 /* ---------------------------------------------------------------------------
  */
-#define CAL_COST_IPEL(mx, my) \
-    g_funcs.pixf.sad[i_pixel](p_org, i_org,\
+#define CAL_COST_IPEL8(mx, my) \
+    g_funcs.pixf.sad8[i_pixel](p_org, i_org,\
+        p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my)
+
+#define CAL_COST_IPEL10(mx, my) \
+    g_funcs.pixf.sad10[i_pixel](p_org, i_org,\
         p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my)
 
 /* ---------------------------------------------------------------------------
  */
-#define ME_COST_IPEL(mx, my) \
+#define ME_COST_IPEL8(mx, my) \
+    if (CHECK_MV_RANGE(mx, my)) {\
+        int cost = g_funcs.pixf.sad8[i_pixel](p_org, i_org,\
+                   p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\
+        COPY3_IF_LT(bcost, cost, bmx, mx, bmy, my);\
+    }
+#define ME_COST_IPEL10(mx, my) \
     if (CHECK_MV_RANGE(mx, my)) {\
-        int cost = g_funcs.pixf.sad[i_pixel](p_org, i_org,\
+        int cost = g_funcs.pixf.sad10[i_pixel](p_org, i_org,\
                    p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\
         COPY3_IF_LT(bcost, cost, bmx, mx, bmy, my);\
     }
 
 /* ---------------------------------------------------------------------------
  */
-#define ME_COST_IPEL_DIR(mx, my, d) \
+#define ME_COST_IPEL8_DIR(mx, my, d) \
+    if (CHECK_MV_RANGE(mx, my)) {\
+        int cost = g_funcs.pixf.sad8[i_pixel](p_org, i_org,\
+                   p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\
+        COPY4_IF_LT(bcost, cost, bmx, mx, bmy, my, dir, d);\
+    }
+
+#define ME_COST_IPEL10_DIR(mx, my, d) \
     if (CHECK_MV_RANGE(mx, my)) {\
-        int cost = g_funcs.pixf.sad[i_pixel](p_org, i_org,\
+        int cost = g_funcs.pixf.sad10[i_pixel](p_org, i_org,\
                    p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\
         COPY4_IF_LT(bcost, cost, bmx, mx, bmy, my, dir, d);\
     }
 
 /* ---------------------------------------------------------------------------
  */
-#define ME_COST_IPEL_X3(m0x, m0y, m1x, m1y, m2x, m2y) \
+#define ME_COST_IPEL8_X3(m0x, m0y, m1x, m1y, m2x, m2y) \
+{\
+    pel8_t *pix_base = p_fref + omy * i_fref + omx;\
+    g_funcs.pixf.sad8_x3[i_pixel](p_org,\
+        pix_base + (m0y) * i_fref + (m0x),\
+        pix_base + (m1y) * i_fref + (m1x),\
+        pix_base + (m2y) * i_fref + (m2x),\
+        i_fref, costs);\
+    costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\
+    costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\
+    costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\
+    COPY3_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y));\
+    COPY3_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y));\
+    COPY3_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y));\
+}
+
+#define ME_COST_IPEL10_X3(m0x, m0y, m1x, m1y, m2x, m2y) \
 {\
-    pel_t *pix_base = p_fref + omy * i_fref + omx;\
-    g_funcs.pixf.sad_x3[i_pixel](p_org,\
+    pel10_t *pix_base = p_fref + omy * i_fref + omx;\
+    g_funcs.pixf.sad10_x3[i_pixel](p_org,\
         pix_base + (m0y) * i_fref + (m0x),\
         pix_base + (m1y) * i_fref + (m1x),\
         pix_base + (m2y) * i_fref + (m2x),\
@@ -220,10 +260,26 @@ static const int i_org = FENC_STRIDE;
 
 /* ---------------------------------------------------------------------------
  */
-#define ME_COST_IPEL_X3_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2) \
+#define ME_COST_IPEL8_X3_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2) \
+{\
+    pel8_t *pix_base = p_fref + omy * i_fref + omx;\
+    g_funcs.pixf.sad8_x3[i_pixel](p_org,\
+        pix_base + (m0y) * i_fref + (m0x),\
+        pix_base + (m1y) * i_fref + (m1x),\
+        pix_base + (m2y) * i_fref + (m2x),\
+        i_fref, costs);\
+    costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\
+    costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\
+    costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\
+    COPY4_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y), dir, d0);\
+    COPY4_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y), dir, d1);\
+    COPY4_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y), dir, d2);\
+}
+
+#define ME_COST_IPEL10_X3_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2) \
 {\
-    pel_t *pix_base = p_fref + omy * i_fref + omx;\
-    g_funcs.pixf.sad_x3[i_pixel](p_org,\
+    pel10_t *pix_base = p_fref + omy * i_fref + omx;\
+    g_funcs.pixf.sad10_x3[i_pixel](p_org,\
         pix_base + (m0y) * i_fref + (m0x),\
         pix_base + (m1y) * i_fref + (m1x),\
         pix_base + (m2y) * i_fref + (m2x),\
@@ -238,11 +294,37 @@ static const int i_org = FENC_STRIDE;
 
 /* ---------------------------------------------------------------------------
  */
-#define ME_COST_IPEL_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
+#define ME_COST_IPEL8_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
+{\
+    if (CHECK_MV_RANGE_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y)) {  \
+        pel8_t *pix_base = p_fref + omy * i_fref + omx;\
+        g_funcs.pixf.sad8_x4[i_pixel](p_org,\
+            pix_base + (m0y) * i_fref + (m0x),\
+            pix_base + (m1y) * i_fref + (m1x),\
+            pix_base + (m2y) * i_fref + (m2x),\
+            pix_base + (m3y) * i_fref + (m3x),\
+            i_fref, costs);\
+        costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\
+        costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\
+        costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\
+        costs[3] += MV_COST_IPEL(omx + (m3x), omy + (m3y));\
+        COPY3_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y));\
+        COPY3_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y));\
+        COPY3_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y));\
+        COPY3_IF_LT(bcost, costs[3], bmx, omx + (m3x), bmy, omy + (m3y));\
+    } else {                    \
+        ME_COST_IPEL8(m0x, m0y); \
+        ME_COST_IPEL8(m1x, m1y); \
+        ME_COST_IPEL8(m2x, m2y); \
+        ME_COST_IPEL8(m3x, m3y); \
+    } \
+}
+
+#define ME_COST_IPEL10_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
 {\
     if (CHECK_MV_RANGE_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y)) {  \
-        pel_t *pix_base = p_fref + omy * i_fref + omx;\
-        g_funcs.pixf.sad_x4[i_pixel](p_org,\
+        pel10_t *pix_base = p_fref + omy * i_fref + omx;\
+        g_funcs.pixf.sad10_x4[i_pixel](p_org,\
             pix_base + (m0y) * i_fref + (m0x),\
             pix_base + (m1y) * i_fref + (m1x),\
             pix_base + (m2y) * i_fref + (m2x),\
@@ -257,19 +339,37 @@ static const int i_org = FENC_STRIDE;
         COPY3_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y));\
         COPY3_IF_LT(bcost, costs[3], bmx, omx + (m3x), bmy, omy + (m3y));\
     } else {                    \
-        ME_COST_IPEL(m0x, m0y); \
-        ME_COST_IPEL(m1x, m1y); \
-        ME_COST_IPEL(m2x, m2y); \
-        ME_COST_IPEL(m3x, m3y); \
+        ME_COST_IPEL10(m0x, m0y); \
+        ME_COST_IPEL10(m1x, m1y); \
+        ME_COST_IPEL10(m2x, m2y); \
+        ME_COST_IPEL10(m3x, m3y); \
     } \
 }
 
 /* ---------------------------------------------------------------------------
  */
-#define ME_COST_IPEL_X4_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2, m3x, m3y, d3) \
+#define ME_COST_IPEL8_X4_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2, m3x, m3y, d3) \
+{\
+    pel8_t *pix_base = p_fref + omy * i_fref + omx;\
+    g_funcs.pixf.sad8_x4[i_pixel](p_org,\
+        pix_base + (m0y) * i_fref + (m0x),\
+        pix_base + (m1y) * i_fref + (m1x),\
+        pix_base + (m2y) * i_fref + (m2x),\
+        pix_base + (m3y) * i_fref + (m3x), i_fref, costs);\
+    costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\
+    costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\
+    costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\
+    costs[3] += MV_COST_IPEL(omx + (m3x), omy + (m3y));\
+    COPY4_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y), dir, d0);\
+    COPY4_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y), dir, d1);\
+    COPY4_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y), dir, d2);\
+    COPY4_IF_LT(bcost, costs[3], bmx, omx + (m3x), bmy, omy + (m3y), dir, d3);\
+}
+
+#define ME_COST_IPEL10_X4_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2, m3x, m3y, d3) \
 {\
-    pel_t *pix_base = p_fref + omy * i_fref + omx;\
-    g_funcs.pixf.sad_x4[i_pixel](p_org,\
+    pel10_t *pix_base = p_fref + omy * i_fref + omx;\
+    g_funcs.pixf.sad10_x4[i_pixel](p_org,\
         pix_base + (m0y) * i_fref + (m0x),\
         pix_base + (m1y) * i_fref + (m1x),\
         pix_base + (m2y) * i_fref + (m2x),\
@@ -286,18 +386,51 @@ static const int i_org = FENC_STRIDE;
 
 /* ---------------------------------------------------------------------------
  * for TZ */
-#define ME_COST_IPEL_DIR_DIST(mx, my, direction, dist) \
+#define ME_COST_IPEL8_DIR_DIST(mx, my, direction, dist) \
+    if (CHECK_MV_RANGE(mx, my)) {\
+        int cost = g_funcs.pixf.sad8[i_pixel](p_org, i_org,\
+                   p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\
+        COPY5_IF_LT(mv->bcost, cost, mv->bmx, mx, mv->bmy, my, mv->bdir, direction, mv->bdist, dist);\
+    }
+
+#define ME_COST_IPEL10_DIR_DIST(mx, my, direction, dist) \
     if (CHECK_MV_RANGE(mx, my)) {\
-        int cost = g_funcs.pixf.sad[i_pixel](p_org, i_org,\
+        int cost = g_funcs.pixf.sad10[i_pixel](p_org, i_org,\
                    p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\
         COPY5_IF_LT(mv->bcost, cost, mv->bmx, mx, mv->bmy, my, mv->bdir, direction, mv->bdist, dist);\
     }
 
 /* ---------------------------------------------------------------------------
  * for TZ */
-#define ME_COST_IPEL_X4_DIR_DIST(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
+#define ME_COST_IPEL8_X4_DIR_DIST(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
+{\
+    g_funcs.pixf.sad8_x4[i_pixel](p_org,\
+        p_fref + (m0x) + (m0y) * i_fref,\
+        p_fref + (m1x) + (m1y) * i_fref,\
+        p_fref + (m2x) + (m2y) * i_fref,\
+        p_fref + (m3x) + (m3y) * i_fref,\
+        i_fref, costs);\
+    (costs)[0] += MV_COST_IPEL(m0x, m0y);\
+    (costs)[1] += MV_COST_IPEL(m1x, m1y);\
+    (costs)[2] += MV_COST_IPEL(m2x, m2y);\
+    (costs)[3] += MV_COST_IPEL(m3x, m3y);\
+    if (CHECK_MV_RANGE(m0x,m0y)) {\
+        COPY5_IF_LT(mv->bcost, costs[0], mv->bmx, m0x, mv->bmy, m0y, mv->bdir, p0, mv->bdist, d0);\
+    }\
+    if (CHECK_MV_RANGE(m1x,m1y)) {\
+        COPY5_IF_LT(mv->bcost, costs[1], mv->bmx, m1x, mv->bmy, m1y, mv->bdir, p1, mv->bdist, d1);\
+    }\
+    if (CHECK_MV_RANGE(m2x,m2y)) {\
+        COPY5_IF_LT(mv->bcost, costs[2], mv->bmx, m2x, mv->bmy, m2y, mv->bdir, p2, mv->bdist, d2);\
+    }\
+    if (CHECK_MV_RANGE(m3x,m3y)) {\
+        COPY5_IF_LT(mv->bcost, costs[3], mv->bmx, m3x, mv->bmy, m3y, mv->bdir, p3, mv->bdist, d3);\
+    }\
+}
+
+#define ME_COST_IPEL10_X4_DIR_DIST(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
 {\
-    g_funcs.pixf.sad_x4[i_pixel](p_org,\
+    g_funcs.pixf.sad10_x4[i_pixel](p_org,\
         p_fref + (m0x) + (m0y) * i_fref,\
         p_fref + (m1x) + (m1y) * i_fref,\
         p_fref + (m2x) + (m2y) * i_fref,\
@@ -325,11 +458,18 @@ static const int i_org = FENC_STRIDE;
  * diamond:     1
  *            1 0 1
  *              1    */
-#define DIA_ITER(mx, my) \
+#define DIA_ITER8(mx, my) \
 {\
     omx = mx;\
     omy = my;\
-    ME_COST_IPEL_X4(0,-1, -1,0, 1,0, 0,1);\
+    ME_COST_IPEL8_X4(0,-1, -1,0, 1,0, 0,1);\
+}
+
+#define DIA_ITER10(mx, my) \
+{\
+    omx = mx;\
+    omy = my;\
+    ME_COST_IPEL10_X4(0,-1, -1,0, 1,0, 0,1);\
 }
 
 
@@ -341,16 +481,56 @@ static const int i_org = FENC_STRIDE;
 
 /* ---------------------------------------------------------------------------
  */
-#define ME_COST_QPEL(mx, my) \
+#define ME_COST_QPEL8(mx, my) \
+{\
+    pel8_t *p_pred = p_filtered[(((my) & 3) << 2) + ((mx) & 3)] + i_offset\
+                  + ((my) >> 2) * i_fref + ((mx) >> 2); \
+    cost = g_funcs.pixf.fpel8_cmp[i_pixel](p_org, i_org, p_pred, i_fref) + MV_COST_FPEL(mx, my);\
+}
+
+#define ME_COST_QPEL10(mx, my) \
 {\
-    pel_t *p_pred = p_filtered[(((my) & 3) << 2) + ((mx) & 3)] + i_offset\
+    pel10_t *p_pred = p_filtered[(((my) & 3) << 2) + ((mx) & 3)] + i_offset\
                   + ((my) >> 2) * i_fref + ((mx) >> 2); \
-    cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, i_fref) + MV_COST_FPEL(mx, my);\
+    cost = g_funcs.pixf.fpel10_cmp[i_pixel](p_org, i_org, p_pred, i_fref) + MV_COST_FPEL(mx, my);\
 }
 
 /* ---------------------------------------------------------------------------
  */
-#define ME_COST_QPEL_SYM \
+#define ME_COST_QPEL8_SYM \
+{\
+    int mx_sym;\
+    int my_sym;\
+    \
+    cost = MAX_DISTORTION;\
+    if (h->i_type == SLICE_TYPE_B) {\
+        mx_sym = -scale_mv_skip  (   mx, distance_bwd, distance_fwd);\
+        my_sym = -scale_mv_skip_y(h, my, distance_bwd, distance_fwd);\
+    } else {\
+        mx_sym = scale_mv_skip  (   mx, distance_bwd, distance_fwd);\
+        my_sym = scale_mv_skip_y(h, my, distance_bwd, distance_fwd);\
+    }\
+    \
+    if (CHECK_MV_RANGE(mx, my) && CHECK_MV_RANGE(mx_sym, my_sym)) {\
+        int xx1 = mx     >> 2;\
+        int yy1 = my     >> 2;\
+        int xx2 = mx_sym >> 2;\
+        int yy2 = my_sym >> 2;\
+        pel8_t *p_src1 = p_filtered1[((my     & 3) << 2) + (mx     & 3)]; \
+        pel8_t *p_src2 = p_filtered2[((my_sym & 3) << 2) + (mx_sym & 3)]; \
+        pel8_t *p_pred = buf_pixel_temp;\
+        \
+        if (p_src1 != NULL && p_src2 != NULL) { \
+            p_src1 += i_offset + yy1 * i_fref + xx1;\
+            p_src2 += i_offset + yy2 * i_fref + xx2;\
+            g_funcs.pixf.avg8[i_pixel](p_pred, 64, p_src1, i_fref, p_src2, i_fref, 32); \
+            cost = g_funcs.pixf.fpel8_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE)\
+                 + MV_COST_FPEL(mx, my);\
+        } \
+    }\
+}
+
+#define ME_COST_QPEL10_SYM \
 {\
     int mx_sym;\
     int my_sym;\
@@ -369,15 +549,15 @@ static const int i_org = FENC_STRIDE;
         int yy1 = my     >> 2;\
         int xx2 = mx_sym >> 2;\
         int yy2 = my_sym >> 2;\
-        pel_t *p_src1 = p_filtered1[((my     & 3) << 2) + (mx     & 3)]; \
-        pel_t *p_src2 = p_filtered2[((my_sym & 3) << 2) + (mx_sym & 3)]; \
-        pel_t *p_pred = buf_pixel_temp;\
+        pel10_t *p_src1 = p_filtered1[((my     & 3) << 2) + (mx     & 3)]; \
+        pel10_t *p_src2 = p_filtered2[((my_sym & 3) << 2) + (mx_sym & 3)]; \
+        pel10_t *p_pred = buf_pixel_temp;\
         \
         if (p_src1 != NULL && p_src2 != NULL) { \
             p_src1 += i_offset + yy1 * i_fref + xx1;\
             p_src2 += i_offset + yy2 * i_fref + xx2;\
-            g_funcs.pixf.avg[i_pixel](p_pred, 64, p_src1, i_fref, p_src2, i_fref, 32); \
-            cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE)\
+            g_funcs.pixf.avg10[i_pixel](p_pred, 64, p_src1, i_fref, p_src2, i_fref, 32); \
+            cost = g_funcs.pixf.fpel10_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE)\
                  + MV_COST_FPEL(mx, my);\
         } \
     }\
@@ -385,12 +565,24 @@ static const int i_org = FENC_STRIDE;
 
 /* ---------------------------------------------------------------------------
  */
-#define ME_COST_QPEL_BID \
+#define ME_COST_QPEL8_BID \
+    if (CHECK_MV_RANGE(mx, my) && CHECK_MV_RANGE(mx_bid, my_bid)) {\
+        int xx1 = mx     >> 2;\
+        int yy1 = my     >> 2;\
+        pel8_t *p_src1 = p_filtered1[((my     & 3) << 2) + (mx     & 3)] + i_offset + yy1 * i_fref + xx1;\
+        int distortion = g_funcs.pixf.fpel8_cmp[i_pixel](buf_pixel_temp, MAX_CU_SIZE, p_src1, i_fref) >> 1;\
+        \
+        cost = distortion + MV_COST_FPEL(mx, my) + mv_bid_bit;\
+    } else {\
+        cost = MAX_DISTORTION;\
+    }
+
+#define ME_COST_QPEL10_BID \
     if (CHECK_MV_RANGE(mx, my) && CHECK_MV_RANGE(mx_bid, my_bid)) {\
         int xx1 = mx     >> 2;\
         int yy1 = my     >> 2;\
-        pel_t *p_src1 = p_filtered1[((my     & 3) << 2) + (mx     & 3)] + i_offset + yy1 * i_fref + xx1;\
-        int distortion = g_funcs.pixf.fpel_cmp[i_pixel](buf_pixel_temp, MAX_CU_SIZE, p_src1, i_fref) >> 1;\
+        pel10_t *p_src1 = p_filtered1[((my     & 3) << 2) + (mx     & 3)] + i_offset + yy1 * i_fref + xx1;\
+        int distortion = g_funcs.pixf.fpel10_cmp[i_pixel](buf_pixel_temp, MAX_CU_SIZE, p_src1, i_fref) >> 1;\
         \
         cost = distortion + MV_COST_FPEL(mx, my) + mv_bid_bit;\
     } else {\
@@ -405,9 +597,9 @@ static const int i_org = FENC_STRIDE;
  */
 
 /* ---------------------------------------------------------------------------
- * ÈôcandMV³¬¹ý1/4¾«¶ÈµÄãÐÖµ£¬ÔòÐÂµÄMVÓ¦²ÉÓÃ2±¶²½³¤£¬Èô´ËÊ±ÐÂµÄMVÔÚãÐÖµ·¶Î§ÄÚ£¬Ôò·µ»Ø1£¬±íÊ¾ÐÂµÄMV²»Ó¦¼ÌÐøËÑË÷
- * ÈôcandMVÔÚ1/4¾«¶ÈãÐÖµ·¶Î§ÄÚ£¬ÔòÐÂµÄMV²ÉÓÃµ¥±¶²½³¤£¬´ËÊ±ÈôÐÂMV³¬¹ýãÐÖµ·¶Î§£¬Ôò·µ»Ø1£¬±íÊ¾ÐÂµÄMV²»Ó¦¼ÌÐøËÑË÷
- * ·ñÔò£¬·µ»Ø0Öµ±íÊ¾ÐÂµÄMVÓ¦¸Ã¼ÌÐø±»ËÑË÷
+ * If candMV exceeds the quarter-pel precision threshold, the new MV uses a double step; if the new MV then lies inside the threshold range, return 1 (do not search it further).
+ * If candMV is inside the quarter-pel precision threshold range, the new MV uses a single step; if the new MV then exceeds the threshold range, return 1 (do not search it further).
+ * Otherwise return 0, meaning the new MV should continue to be searched.
  */
 static int pmvr_adapt_mv(int *mx, int *my, int ctr_x, int ctr_y,
                          int mv_x, int mv_y, int step_x, int step_y)
@@ -479,11 +671,6 @@ mv_clip(int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_min[2], int mv_m
 static dist_t
 me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me)
 {
-#if !ENABLE_FRAME_SUBPEL_INTPL
-    ALIGN32(pel_t p_pred[MAX_CU_SIZE * MAX_CU_SIZE]);
-#endif
-    pel_t  *p_org     = p_me->p_fenc;
-    pel_t **p_filtered = p_me->p_fref_1st->filtered;
     int i_fref   = p_me->p_fref_1st->i_stride[IMG_Y];
     int pmx      = p_me->mvp.x;
     int pmy      = p_me->mvp.y;
@@ -508,8 +695,88 @@ me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me)
     bmy = p_me->bmv.y;
     bmv = p_me->bmv;
 
+    if (h->param->input_sample_bit_depth == 8) {
+#if !ENABLE_FRAME_SUBPEL_INTPL
+    ALIGN32(pel8_t p_pred[MAX_CU_SIZE * MAX_CU_SIZE]);
+#endif
+    pel8_t  *p_org     = p_me->p_fenc8;
+    pel8_t **p_filtered = p_me->p_fref_1st->filtered8;
+    if (h->param->enable_hadamard) {
+        ME_COST_QPEL8(bmx, bmy);
+        bcost = cost;
+    } else {
+        bcost = p_me->bcost;
+    }
+
+    /* -------------------------------------------------------------
+     * half-pel refine */
+
+    // loop over search positions
+    for (pos = 1; pos < search_pos2; pos += search_step) {
+        mx = bmx + (search_pattern[pos][0] << 1);
+        my = bmy + (search_pattern[pos][1] << 1);
+#if ENABLE_FRAME_SUBPEL_INTPL
+        ME_COST_QPEL8(mx, my);
+#else
+        mv_t mvt;
+        mvt.v = MAKEDWORD(mx, my);
+        get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, p_me->i_block_w, p_me->i_block_h);
+        mc_luma(p_pred, MAX_CU_SIZE, mvt.x, mvt.y, p_me->i_block_w, p_me->i_block_h, p_me->p_fref_1st);
+        cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE) + MV_COST_FPEL(mx, my);
+#endif
+        if (cost < bcost) {
+            bcost = cost;
+            bmv.v = MAKEDWORD(mx, my);
+        }
+    }
+
+    bmx = bmv.x;
+    bmy = bmv.y;
+
+    /* -------------------------------------------------------------
+     * quarter-pel refine */
+
+    if (h->use_fractional_me > 1) {
+        // loop over search positions
+        for (pos = 1; pos < search_pos4; pos += search_step) {
+            if (h->param->enable_pmvr) {
+                if (pmvr_adapt_mv(&mx, &my, ctr_x, ctr_y, bmx, bmy, search_pattern[pos][0], search_pattern[pos][1])) {
+                    continue;
+                }
+            } else {
+                mx = bmx + search_pattern[pos][0];    // quarter-pel units
+                my = bmy + search_pattern[pos][1];    // quarter-pel units
+            }
+
+            // set motion vector cost
+#if ENABLE_FRAME_SUBPEL_INTPL
+            ME_COST_QPEL8(mx, my);
+#else
+            mv_t mvt;
+            mvt.v = MAKEDWORD(mx, my);
+            get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, p_me->i_block_w, p_me->i_block_h);
+            mc_luma(p_pred, MAX_CU_SIZE, mvt.x, mvt.y, p_me->i_block_w, p_me->i_block_h, p_me->p_fref_1st);
+            cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE) + MV_COST_FPEL(mx, my);
+#endif
+            if (cost < bcost) {
+                bcost = cost;
+                bmv.v = MAKEDWORD(mx, my);
+            }
+        }
+    }
+    // save the results
+    p_me->bmv   = bmv;
+    p_me->bcost = bcost;
+    p_me->mvcost[PDIR_FWD] = MV_COST_FPEL(bmv.x,bmv.y);
+    return bcost;
+    } else {
+#if !ENABLE_FRAME_SUBPEL_INTPL
+    ALIGN32(pel10_t p_pred[MAX_CU_SIZE * MAX_CU_SIZE]);
+#endif
+    pel10_t  *p_org     = p_me->p_fenc10;
+    pel10_t **p_filtered = p_me->p_fref_1st->filtered10;
     if (h->param->enable_hadamard) {
-        ME_COST_QPEL(bmx, bmy);
+        ME_COST_QPEL10(bmx, bmy);
         bcost = cost;
     } else {
         bcost = p_me->bcost;
@@ -523,7 +790,7 @@ me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me)
         mx = bmx + (search_pattern[pos][0] << 1);
         my = bmy + (search_pattern[pos][1] << 1);
 #if ENABLE_FRAME_SUBPEL_INTPL
-        ME_COST_QPEL(mx, my);
+        ME_COST_QPEL10(mx, my);
 #else
         mv_t mvt;
         mvt.v = MAKEDWORD(mx, my);
@@ -557,7 +824,7 @@ me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me)
 
             // set motion vector cost
 #if ENABLE_FRAME_SUBPEL_INTPL
-            ME_COST_QPEL(mx, my);
+            ME_COST_QPEL10(mx, my);
 #else
             mv_t mvt;
             mvt.v = MAKEDWORD(mx, my);
@@ -576,6 +843,7 @@ me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me)
     p_me->bcost = bcost;
     p_me->mvcost[PDIR_FWD] = MV_COST_FPEL(bmv.x,bmv.y);
     return bcost;
+    }
 }
 
 
@@ -666,10 +934,10 @@ void xavs2_me_init_umh_threshold(xavs2_t *h, double *bsize, int i_qp)
 
 /* ---------------------------------------------------------------------------
  */
-static void tz_pattern_search(xavs2_t* h,
+static void tz_pattern_search8(xavs2_t* h,
                               xavs2_me_t *p_me,
-                              pel_t*    p_org,
-                              pel_t*    p_fref,
+                              pel8_t*    p_org,
+                              pel8_t*    p_fref,
                               mv_info*  mv,
                               int       mv_x_min,
                               int       mv_y_min,
@@ -700,22 +968,22 @@ static void tz_pattern_search(xavs2_t* h,
     int idx;
 
     if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) {
-        ME_COST_IPEL_X4_DIR_DIST(omx,   top,    2, dist,      /* direction */
+        ME_COST_IPEL8_X4_DIR_DIST(omx,   top,    2, dist,      /* direction */
                                  left,  omy,    4, dist,      /*     2     */
                                  right, omy,    5, dist,      /*   4 * 5   */
                                  omx,   bottom, 7, dist);     /*     7     */
     } else {
         if (top >= mv_y_min) {        // check top
-            ME_COST_IPEL_DIR_DIST(omx, top, 2, dist);
+            ME_COST_IPEL8_DIR_DIST(omx, top, 2, dist);
         }
         if (left >= mv_x_min) {       // check middle left
-            ME_COST_IPEL_DIR_DIST(left, omy, 4, dist);
+            ME_COST_IPEL8_DIR_DIST(left, omy, 4, dist);
         }
         if (right <= mv_x_max) {      // check middle right
-            ME_COST_IPEL_DIR_DIST(right, omy, 5, dist);
+            ME_COST_IPEL8_DIR_DIST(right, omy, 5, dist);
         }
         if (bottom <= mv_y_max) {     // check bottom
-            ME_COST_IPEL_DIR_DIST(omx, bottom, 7, dist);
+            ME_COST_IPEL8_DIR_DIST(omx, bottom, 7, dist);
         }
     }
 
@@ -745,42 +1013,42 @@ static void tz_pattern_search(xavs2_t* h,
 
         // check border
         if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) {
-            ME_COST_IPEL_X4_DIR_DIST(omx,    top,     2, dist,
+            ME_COST_IPEL8_X4_DIR_DIST(omx,    top,     2, dist,
                                      left2,  top2,    1, dist >> 1,
                                      right2, top2,    3, dist >> 1,
                                      left,   omy,     4, dist);
-            ME_COST_IPEL_X4_DIR_DIST(right,  omy,     5, dist,
+            ME_COST_IPEL8_X4_DIR_DIST(right,  omy,     5, dist,
                                      left2,  bottom2, 6, dist >> 1,
                                      right2, bottom2, 8, dist >> 1,
                                      omx,    bottom,  7, dist);
         } else {
             if (top >= mv_y_min) {              // check top
-                ME_COST_IPEL_DIR_DIST(omx, top, 2, dist);
+                ME_COST_IPEL8_DIR_DIST(omx, top, 2, dist);
             }
             if (top2 >= mv_y_min) {             // check half top
                 if (left2 >= mv_x_min) {        // check half left
-                    ME_COST_IPEL_DIR_DIST(left2, top2, 1, (dist >> 1));
+                    ME_COST_IPEL8_DIR_DIST(left2, top2, 1, (dist >> 1));
                 }
                 if (right2 <= mv_x_max) {       // check half right
-                    ME_COST_IPEL_DIR_DIST(right2, top2, 3, (dist >> 1));
+                    ME_COST_IPEL8_DIR_DIST(right2, top2, 3, (dist >> 1));
                 }
             }
             if (left >= mv_x_min) {             // check left
-                ME_COST_IPEL_DIR_DIST(left, omy, 4, dist);
+                ME_COST_IPEL8_DIR_DIST(left, omy, 4, dist);
             }
             if (right <= mv_x_max) {            // check right
-                ME_COST_IPEL_DIR_DIST(right, omy, 5, dist);
+                ME_COST_IPEL8_DIR_DIST(right, omy, 5, dist);
             }
             if (bottom2 <= mv_y_max) {          // check half bottom
                 if (left2 >= mv_x_min) {        // check half left
-                    ME_COST_IPEL_DIR_DIST(left2, bottom2, 6, (dist >> 1));
+                    ME_COST_IPEL8_DIR_DIST(left2, bottom2, 6, (dist >> 1));
                 }
                 if (right2 <= mv_x_max) {       // check half right
-                    ME_COST_IPEL_DIR_DIST(right2, bottom2, 8, (dist >> 1));
+                    ME_COST_IPEL8_DIR_DIST(right2, bottom2, 8, (dist >> 1));
                 }
             }
             if (bottom <= mv_y_max) {           // check bottom
-                ME_COST_IPEL_DIR_DIST(omx, bottom, 7, dist);
+                ME_COST_IPEL8_DIR_DIST(omx, bottom, 7, dist);
             }
         }
         if (mv->bcost < bcost) {
@@ -809,7 +1077,7 @@ static void tz_pattern_search(xavs2_t* h,
              *               2
              *               3
              *               0                  */
-            ME_COST_IPEL_X4_DIR_DIST(omx,   top,    0, dist,
+            ME_COST_IPEL8_X4_DIR_DIST(omx,   top,    0, dist,
                                      left,  omy,    0, dist,
                                      right, omy,    0, dist,
                                      omx,   bottom, 0, dist);
@@ -818,7 +1086,7 @@ static void tz_pattern_search(xavs2_t* h,
                 posYB = bottom - ((dist >> 2) * idx);
                 posXL = omx    - ((dist >> 2) * idx);
                 posXR = omx    + ((dist >> 2) * idx);
-                ME_COST_IPEL_X4_DIR_DIST(posXL, posYT, 0, dist,
+                ME_COST_IPEL8_X4_DIR_DIST(posXL, posYT, 0, dist,
                                          posXR, posYT, 0, dist,
                                          posXL, posYB, 0, dist,
                                          posXR, posYB, 0, dist);
@@ -826,16 +1094,16 @@ static void tz_pattern_search(xavs2_t* h,
         } else {
             // check border for each mv
             if (top >= mv_y_min) {              // check top
-                ME_COST_IPEL_DIR_DIST(omx, top, 0, dist);
+                ME_COST_IPEL8_DIR_DIST(omx, top, 0, dist);
             }
             if (left >= mv_x_min) {             // check left
-                ME_COST_IPEL_DIR_DIST(left, omy, 0, dist);
+                ME_COST_IPEL8_DIR_DIST(left, omy, 0, dist);
             }
             if (right <= mv_x_max) {            // check right
-                ME_COST_IPEL_DIR_DIST(right, omy, 0, dist);
+                ME_COST_IPEL8_DIR_DIST(right, omy, 0, dist);
             }
             if (bottom <= mv_y_max) {           // check bottom
-                ME_COST_IPEL_DIR_DIST(omx, bottom, 0, dist);
+                ME_COST_IPEL8_DIR_DIST(omx, bottom, 0, dist);
             }
 
             for (idx = 1; idx < 4; idx++) {
@@ -846,18 +1114,18 @@ static void tz_pattern_search(xavs2_t* h,
 
                 if (posYT >= mv_y_min) {        // check top
                     if (posXL >= mv_x_min) {    // check left
-                        ME_COST_IPEL_DIR_DIST(posXL, posYT, 0, dist);
+                        ME_COST_IPEL8_DIR_DIST(posXL, posYT, 0, dist);
                     }
                     if (posXR <= mv_x_max) {    // check right
-                        ME_COST_IPEL_DIR_DIST(posXR, posYT, 0, dist);
+                        ME_COST_IPEL8_DIR_DIST(posXR, posYT, 0, dist);
                     }
                 }
                 if (posYB <= mv_y_max) {        // check bottom
                     if (posXL >= mv_x_min) {    // check left
-                        ME_COST_IPEL_DIR_DIST(posXL, posYB, 0, dist);
+                        ME_COST_IPEL8_DIR_DIST(posXL, posYB, 0, dist);
                     }
                     if (posXR <= mv_x_max) {    // check right
-                        ME_COST_IPEL_DIR_DIST(posXR, posYB, 0, dist);
+                        ME_COST_IPEL8_DIR_DIST(posXR, posYB, 0, dist);
                     }
                 }
             }
@@ -871,57 +1139,538 @@ static void tz_pattern_search(xavs2_t* h,
     }
 }
 
-// int g_me_time[4] = { 0 };
-
-/* ---------------------------------------------------------------------------
- * return minimum motion cost after search
- */
-dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc)
+static void tz_pattern_search10(xavs2_t* h,        /* pel10_t variant of the TZ pattern search */
+                              xavs2_me_t *p_me,
+                              pel10_t*    p_org,
+                              pel10_t*    p_fref,
+                              mv_info*  mv,       /* in: start MV and cost; out: best MV, cost, dir, dist */
+                              int       mv_x_min,
+                              int       mv_y_min,
+                              int       mv_x_max,
+                              int       mv_y_max,
+                              int       i_pixel,
+                              int       i_fref,
+                              int       earlyExitIters, /* return after this many passes without improvement */
+                              int       merange)        /* largest dist tried by the final loop */
 {
-    /* special version of pack to allow shortcuts in CHECK_MV_RANGE */
-    ALIGNED_ARRAY_16(int, costs,[8]);
-    double beta2  = p_me->beta2 + 1;
-    double beta3  = p_me->beta3 + 1;
-    pel_t *p_org = p_me->p_fenc;
-    pel_t *p_fref = p_me->p_fref_1st->planes[IMG_Y] + p_me->i_bias;
-    int i_fref    = p_me->p_fref_1st->i_stride[IMG_Y];
-    int i_pixel   = p_me->i_pixel;
-    int mv_x_min  = p_me->mv_min_fpel[0];
-    int mv_y_min  = p_me->mv_min_fpel[1];
-    int mv_x_max  = p_me->mv_max_fpel[0];
-    int mv_y_max  = p_me->mv_max_fpel[1];
-    int me_range  = h->param->search_range;
-    int lambda    = h->i_lambda_factor; // factor for determining Lagrangian's motion cost
+    ALIGN16(int costs[16]);
     const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min);
     const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000;
     const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp.x;
     const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp.y;
-    uint32_t pmv;
-    dist_t bcost = MAX_DISTORTION;
-    int bmx = 0, bmy = 0;
-    int omx, omy;
-    int i, j, dir, idx;
+    int lambda = h->i_lambda_factor;
+    int rounds = 0;            /* consecutive passes that did not lower mv->bcost */
+    int dist   = 1;            /* current search radius */
+    int omx    = mv->bmx;
+    int omy    = mv->bmy;      /* fix: vertical centre comes from bmy, not bmx */
+    dist_t bcost  = mv->bcost;
+    int top    = omy - dist;
+    int bottom = omy + dist;
+    int left   = omx - dist;
+    int right  = omx + dist;
+    int top2, bottom2, left2, right2;
+    int posYT, posYB, posXL, posXR;
+    int idx;
+
+    if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) {
+        ME_COST_IPEL10_X4_DIR_DIST(omx,   top,    2, dist,      /* direction */
+                                 left,  omy,    4, dist,      /*     2     */
+                                 right, omy,    5, dist,      /*   4 * 5   */
+                                 omx,   bottom, 7, dist);     /*     7     */
+    } else {
+        if (top >= mv_y_min) {        // check top
+            ME_COST_IPEL10_DIR_DIST(omx, top, 2, dist);
+        }
+        if (left >= mv_x_min) {       // check middle left
+            ME_COST_IPEL10_DIR_DIST(left, omy, 4, dist);
+        }
+        if (right <= mv_x_max) {      // check middle right
+            ME_COST_IPEL10_DIR_DIST(right, omy, 5, dist);
+        }
+        if (bottom <= mv_y_max) {     // check bottom
+            ME_COST_IPEL10_DIR_DIST(omx, bottom, 7, dist);
+        }
+    }
+
+    if (mv->bcost < bcost) {
+        rounds = 0;
+    } else if (++rounds >= earlyExitIters) {
+        return;
+    }
+
+    for (dist = 2; dist <= 8; dist <<= 1) {
+        /*          2           points 2, 4, 5, 7 are dist
+         *        1   3         points 1, 3, 6, 8 are dist/2
+         *      4   *   5
+         *        6   8
+         *          7           */
+        omx     = mv->bmx;
+        omy     = mv->bmy;      /* fix: re-centre vertically on the best MV (was bmx) */
+        bcost   = mv->bcost;
+        top     = omy - dist;
+        bottom  = omy + dist;
+        left    = omx - dist;
+        right   = omx + dist;
+        top2    = omy - (dist >> 1);
+        bottom2 = omy + (dist >> 1);
+        left2   = omx - (dist >> 1);
+        right2  = omx + (dist >> 1);
+
+        // check border
+        if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) {
+            ME_COST_IPEL10_X4_DIR_DIST(omx,    top,     2, dist,
+                                     left2,  top2,    1, dist >> 1,
+                                     right2, top2,    3, dist >> 1,
+                                     left,   omy,     4, dist);
+            ME_COST_IPEL10_X4_DIR_DIST(right,  omy,     5, dist,
+                                     left2,  bottom2, 6, dist >> 1,
+                                     right2, bottom2, 8, dist >> 1,
+                                     omx,    bottom,  7, dist);
+        } else {
+            if (top >= mv_y_min) {              // check top
+                ME_COST_IPEL10_DIR_DIST(omx, top, 2, dist);
+            }
+            if (top2 >= mv_y_min) {             // check half top
+                if (left2 >= mv_x_min) {        // check half left
+                    ME_COST_IPEL10_DIR_DIST(left2, top2, 1, (dist >> 1));
+                }
+                if (right2 <= mv_x_max) {       // check half right
+                    ME_COST_IPEL10_DIR_DIST(right2, top2, 3, (dist >> 1));
+                }
+            }
+            if (left >= mv_x_min) {             // check left
+                ME_COST_IPEL10_DIR_DIST(left, omy, 4, dist);
+            }
+            if (right <= mv_x_max) {            // check right
+                ME_COST_IPEL10_DIR_DIST(right, omy, 5, dist);
+            }
+            if (bottom2 <= mv_y_max) {          // check half bottom
+                if (left2 >= mv_x_min) {        // check half left
+                    ME_COST_IPEL10_DIR_DIST(left2, bottom2, 6, (dist >> 1));
+                }
+                if (right2 <= mv_x_max) {       // check half right
+                    ME_COST_IPEL10_DIR_DIST(right2, bottom2, 8, (dist >> 1));
+                }
+            }
+            if (bottom <= mv_y_max) {           // check bottom
+                ME_COST_IPEL10_DIR_DIST(omx, bottom, 7, dist);
+            }
+        }
+        if (mv->bcost < bcost) {
+            rounds = 0;
+        } else if (++rounds >= earlyExitIters) {
+            return;
+        }
+    }
+
+    for (dist = 16; dist <= merange; dist <<= 1) {
+        omx    = mv->bmx;
+        omy    = mv->bmy;      /* fix: re-centre vertically on the best MV (was bmx) */
+        bcost  = mv->bcost;
+        top    = omy - dist;
+        bottom = omy + dist;
+        left   = omx - dist;
+        right  = omx + dist;
+
+        if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) { // check border
+            /* index:        0
+             *               3
+             *               2
+             *               1
+             *       0 3 2 1 * 1 2 3 0
+             *               1
+             *               2
+             *               3
+             *               0                  */
+            ME_COST_IPEL10_X4_DIR_DIST(omx,   top,    0, dist,
+                                     left,  omy,    0, dist,
+                                     right, omy,    0, dist,
+                                     omx,   bottom, 0, dist);
+            for (idx = 1; idx < 4; idx++) {
+                posYT = top    + ((dist >> 2) * idx);
+                posYB = bottom - ((dist >> 2) * idx);
+                posXL = omx    - ((dist >> 2) * idx);
+                posXR = omx    + ((dist >> 2) * idx);
+                ME_COST_IPEL10_X4_DIR_DIST(posXL, posYT, 0, dist,
+                                         posXR, posYT, 0, dist,
+                                         posXL, posYB, 0, dist,
+                                         posXR, posYB, 0, dist);
+            }
+        } else {
+            // check border for each mv
+            if (top >= mv_y_min) {              // check top
+                ME_COST_IPEL10_DIR_DIST(omx, top, 0, dist);
+            }
+            if (left >= mv_x_min) {             // check left
+                ME_COST_IPEL10_DIR_DIST(left, omy, 0, dist);
+            }
+            if (right <= mv_x_max) {            // check right
+                ME_COST_IPEL10_DIR_DIST(right, omy, 0, dist);
+            }
+            if (bottom <= mv_y_max) {           // check bottom
+                ME_COST_IPEL10_DIR_DIST(omx, bottom, 0, dist);
+            }
+
+            for (idx = 1; idx < 4; idx++) {
+                posYT = top    + ((dist >> 2) * idx);
+                posYB = bottom - ((dist >> 2) * idx);
+                posXL = omx    - ((dist >> 2) * idx);
+                posXR = omx    + ((dist >> 2) * idx);
+
+                if (posYT >= mv_y_min) {        // check top
+                    if (posXL >= mv_x_min) {    // check left
+                        ME_COST_IPEL10_DIR_DIST(posXL, posYT, 0, dist);
+                    }
+                    if (posXR <= mv_x_max) {    // check right
+                        ME_COST_IPEL10_DIR_DIST(posXR, posYT, 0, dist);
+                    }
+                }
+                if (posYB <= mv_y_max) {        // check bottom
+                    if (posXL >= mv_x_min) {    // check left
+                        ME_COST_IPEL10_DIR_DIST(posXL, posYB, 0, dist);
+                    }
+                    if (posXR <= mv_x_max) {    // check right
+                        ME_COST_IPEL10_DIR_DIST(posXR, posYB, 0, dist);
+                    }
+                }
+            }
+        }
+
+        if (mv->bcost < bcost) {
+            rounds = 0;
+        } else if (++rounds >= earlyExitIters) {
+            return;
+        }
+    }
+}
+
+// int g_me_time[4] = { 0 };
+
+/* ---------------------------------------------------------------------------
+ * return minimum motion cost after search
+ */
+dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc)
+{
+    /* special version of pack to allow shortcuts in CHECK_MV_RANGE */
+    ALIGNED_ARRAY_16(int, costs,[8]);
+    double beta2  = p_me->beta2 + 1;
+    double beta3  = p_me->beta3 + 1;
+    int i_fref    = p_me->p_fref_1st->i_stride[IMG_Y];
+    int i_pixel   = p_me->i_pixel;
+    int mv_x_min  = p_me->mv_min_fpel[0];
+    int mv_y_min  = p_me->mv_min_fpel[1];
+    int mv_x_max  = p_me->mv_max_fpel[0];
+    int mv_y_max  = p_me->mv_max_fpel[1];
+    int me_range  = h->param->search_range;
+    int lambda    = h->i_lambda_factor; // factor for determining Lagrangian's motion cost
+    const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min);
+    const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000;
+    const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp.x;
+    const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp.y;
+    uint32_t pmv;
+    dist_t bcost = MAX_DISTORTION;
+    int bmx = 0, bmy = 0;
+    int omx, omy;
+    int i, j, dir, idx;
+
+    const int umh_1_3_step = h->UMH_big_hex_level == 2 ? 16 : 8;
+    const int8_t(*search_patern)[2] = h->UMH_big_hex_level == 2 ? HEX4 : FAST_HEX4;
+
+    // g_me_time[0]++;
+    /* -------------------------------------------------------------
+     * try MVP and some key searching points */
+    pmv = MAKEDWORD(mvc[0][0], mvc[0][1]);   /* mvc[0][] is the MVP */
+
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_org = p_me->p_fenc8;
+    pel8_t *p_fref = p_me->p_fref_1st->planes8[IMG_Y] + p_me->i_bias;
+    for (i = 0; i < i_mvc; i++) {
+        int mx = mvc[i][0];
+        int my = mvc[i][1];
+        ME_COST_IPEL8(mx, my);
+    }
+
+    if (bcost == MAX_DISTORTION) {
+        goto _me_error8;         /* me failed */
+    }
+
+    /* -------------------------------------------------------------
+     * search using different method */
+
+    switch (h->param->me_method) {
+    case XAVS2_ME_TZ: {       /* TZ */
+        const int RasterDistance = 16;
+        const int MaxIters = 32;
+        const int EarlyExitIters = 3;
+        dist_t bdist;
+        int mv1_x, mv1_y, mv2_x, mv2_y;
+        mv_info mvinfo;
+
+        omx = bmx;
+        omy = bmy;
+        ME_COST_IPEL8_X3(-2, 0, -1,  2,  1,  2);
+        ME_COST_IPEL8_X3( 2, 0,  1, -2, -1, -2);
+
+        if (CHECK_MV_RANGE(bmx, bmy)) {
+            DIA_ITER8(bmx, bmy);
+        }
+
+        mvinfo.bcost = bcost;
+        mvinfo.bdist = 0;
+        mvinfo.bmx   = bmx;
+        mvinfo.bmy   = bmy;
+        mvinfo.bdir  = 0;
+        tz_pattern_search8(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, EarlyExitIters, me_range);
+        bcost = mvinfo.bcost;
+        bdist = mvinfo.bdist;
+        bmx   = mvinfo.bmx;
+        bmy   = mvinfo.bmy;
+        dir   = mvinfo.bdir;
+
+        if (bdist == 1) {
+            if (!dir) {
+                break;
+            }
+
+            /* if best distance was only 1, check two missing points.
+             * for a given direction 1 to 8, check nearest two outer X pixels*/
+            mv1_x = bmx + offsets[(dir - 1) * 2    ][0];    /*     X   X     */
+            mv1_y = bmy + offsets[(dir - 1) * 2    ][1];    /*   X 1 2 3 X   */
+            mv2_x = bmx + offsets[(dir - 1) * 2 + 1][0];    /*     4 * 5     */
+            mv2_y = bmy + offsets[(dir - 1) * 2 + 1][1];    /*   X 6 7 8 X   */
+            if (CHECK_MV_RANGE(mv1_x, mv1_y)) {             /*     X   X     */
+                ME_COST_IPEL8(mv1_x, mv1_y);
+            }
+            if (CHECK_MV_RANGE(mv2_x, mv2_y)) {
+                ME_COST_IPEL8(mv2_x, mv2_y);
+            }
+
+            /* if no new point is found, stop */
+            if (bcost == mvinfo.bcost) {
+                break;      /* the bcost is not changed */
+            }
+        }
+
+        /* raster search refinement if original search distance was too big */
+        if (bdist > RasterDistance) {
+            const int iRasterDist  = RasterDistance >> 1;
+            const int iRasterDist2 = RasterDistance >> 2;
+            int rmv_y_min = XAVS2_MAX(mv_y_min, bmy - RasterDistance + 2);
+            int rmv_y_max = XAVS2_MIN(mv_y_max, bmy + RasterDistance - 2);
+            int rmv_x_min = XAVS2_MAX(mv_x_min, bmx - RasterDistance + 2);
+            int rmv_x_max = XAVS2_MIN(mv_x_max, bmx + RasterDistance - 2);
+            for (j = rmv_y_min; j < rmv_y_max; j += iRasterDist) {
+                for (i = rmv_x_min; i < rmv_x_max; i += iRasterDist) {
+                    ME_COST_IPEL8_X4(i, j, i, j + iRasterDist2, i + iRasterDist2, j, i + iRasterDist2, j + iRasterDist2);
+                }
+            }
+        }
+
+        while (bdist > 0) {
+            // center a new search around current best
+            mvinfo.bcost = bcost;
+            mvinfo.bdist = 0;
+            mvinfo.bmx   = bmx;
+            mvinfo.bmy   = bmy;
+            mvinfo.bdir  = 0;
+            tz_pattern_search8(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, MaxIters, me_range);
+            bcost = mvinfo.bcost;
+            bdist = mvinfo.bdist;
+            bmx   = mvinfo.bmx;
+            bmy   = mvinfo.bmy;
+            dir   = mvinfo.bdir;
+
+            if (bdist == 1) {
+                /* for a given direction 1 to 8, check nearest 2 outer X pixels */
+                if (dir) {                                       /*    X   X    */
+                    mv1_x = bmx + offsets[(dir - 1) * 2    ][0]; /*  X 1 2 3 X  */
+                    mv1_y = bmy + offsets[(dir - 1) * 2    ][1]; /*    4 * 5    */
+                    mv2_x = bmx + offsets[(dir - 1) * 2 + 1][0]; /*  X 6 7 8 X  */
+                    mv2_y = bmy + offsets[(dir - 1) * 2 + 1][1]; /*    X   X    */
+                    if (CHECK_MV_RANGE(mv1_x, mv1_y)) {
+                        ME_COST_IPEL8(mv1_x, mv1_y);
+                    }
+                    if (CHECK_MV_RANGE(mv2_x, mv2_y)) {
+                        ME_COST_IPEL8(mv2_x, mv2_y);
+                    }
+                }
+                break;
+            }
+        }
+
+        /* equivalent to the above, but eliminates duplicate candidates */
+        goto umh_step8_2;
+    }
+    case XAVS2_ME_UMH:        /* UMH */
+        /* http://www.cnblogs.com/TaigaCon/archive/2014/06/16/3788984.html
+         * 0. initial point search */
+        DIA_ITER8(mvc[0][0], mvc[0][1]);
+        if (pmv && (bmx != mvc[0][0] || bmy != mvc[0][1])) {
+            DIA_ITER8(bmx, bmy);
+            pmv = MAKEDWORD(bmx, bmy);
+        }
+
+        // select different step according to the different cost from upper layer
+        if (p_me->mvp1.v != 0) {
+            int mx = IPEL(p_me->mvp1.x);
+            int my = IPEL(p_me->mvp1.y);
+            ME_COST_IPEL8(mx, my);
+        }
+        EARLY_TERMINATION8(p_me->pred_sad_uplayer);
+        // g_me_time[1]++;
+
+        // prediction using mv of last ref_idx motion vector
+        if (p_me->i_ref_idx > 0) {
+            ME_COST_IPEL8(IPEL(p_me->mvp2.x), IPEL(p_me->mvp2.y));
+        }
+        if (p_me->mvp3.v != 0) {
+            ME_COST_IPEL8(IPEL(p_me->mvp3.x), IPEL(p_me->mvp3.y));
+        }
+
+        /* the current best MV is not the MVP: search a small window around it */
+        if (pmv != (uint32_t)MAKEDWORD(bmx, bmy)) {
+            DIA_ITER8(bmx, bmy);
+        }
+
+        // early termination algorithm
+        EARLY_TERMINATION8(p_me->pred_sad);
+
+        // umh_step_1:
+        /* UMH 1. Unsymmetrical-cross search */
+        // g_me_time[2]++;
+        omx = bmx;
+        omy = bmy;
+        for (i = 1; i <= me_range; i += 2) {
+            ME_COST_IPEL8(omx + i, omy);
+            ME_COST_IPEL8(omx - i, omy);
+        }
+        for (j = 1; j <= me_range / 2; j += 2) {
+            ME_COST_IPEL8(omx, omy + j);
+            ME_COST_IPEL8(omx, omy - j);
+        }
+
+        // early termination algorithm
+        EARLY_TERMINATION8(p_me->pred_sad);
+
+        /* UMH 2. Spiral search */
+        omx = bmx;
+        omy = bmy;
+        for (i = 0; i < 24; i++) {
+            ME_COST_IPEL8(omx + GRID[i][0], omy + GRID[i][1]);
+        }
+
+        // early termination algorithm
+        EARLY_TERMINATION8(p_me->pred_sad);
+
+        // big hexagon
+        if (h->UMH_big_hex_level) {
+            for (j = 1; j <= me_range / 4; j++) {
+                omx = bmx;
+                omy = bmy;
+                for (i = 0; i < umh_1_3_step; i++) {
+                    ME_COST_IPEL8(omx + search_patern[i][0] * j, omy + search_patern[i][1] * j);
+                }
+                if (bmx != omx || bmy != omy) {
+                    EARLY_TERMINATION8(p_me->pred_sad);
+                }
+            }
+        }
+        /* !!! NO break statement here */
+    case XAVS2_ME_HEX:        /* hexagon search */
+umh_step8_2 :                  /* UMH 3. Uneven Multi-Hexagon-grid Search */
+        // g_me_time[3]++;
+        dir = 0;                                        /*   6   5   */
+        omx = bmx;                                      /*           */
+        omy = bmy;                                      /* 1   *   4 */
+        ME_COST_IPEL8_X3_DIR(-1,-2,6,  1,-2,5, -2,0,1);  /*           */
+        ME_COST_IPEL8_X3_DIR( 2, 0,4, -1, 2,2,  1,2,3);  /*   2   3   */
 
-    const int umh_1_3_step = h->UMH_big_hex_level == 2 ? 16 : 8;
-    const int8_t(*search_patern)[2] = h->UMH_big_hex_level == 2 ? HEX4 : FAST_HEX4;
+        if (dir) {
+            const int8_t (*hex)[2];
+            /* UMH 4. Extended Hexagon-based Search */
+            idx = dir - 1;      /* start array index */
+            /* half hexagon, not overlapping the previous iteration */
+            for (i = 0; i < me_range && CHECK_MV_RANGE(bmx, bmy); i++) {
+                dir = 0;
+                omx = bmx;
+                omy = bmy;
+                hex = &HEX2[idx];
+                ME_COST_IPEL8_X3_DIR(hex[0][0],hex[0][1],1, hex[1][0],hex[1][1],2, hex[2][0],hex[2][1],3);
+                if (!dir) {
+                    break;      /* early terminate */
+                }
+                idx = M1MOD6[dir + idx - 1];    /* next start array index */
+            }
+        }
+        /* !!! NO break statement here */
+    case XAVS2_ME_DIA:        /* diamond search */
+umh_step8_3:                   /* UMH 5. the third step with a small search pattern */
+        dir = 0;
+        if (CHECK_MV_RANGE(bmx, bmy)) {
+            omx = bmx;                                          /*    4    */
+            omy = bmy;                                          /*  1 * 3  */
+            ME_COST_IPEL8_X4_DIR(0,-1,4, -1,0,1, 1,0,3, 0,1,2);  /*    2    */
+        }
+        if (dir) {
+            const int8_t (*dia)[2];
+            idx = dir - 1;      /* start array index */
+            /* half diamond, not overlapping the previous iteration */
+            for (i = 0; i < me_range && CHECK_MV_RANGE(bmx, bmy); i++) {
+                dir = 0;
+                omx = bmx;
+                omy = bmy;
+                dia = &DIA1[idx];
+                ME_COST_IPEL8_X3_DIR(dia[0][0],dia[0][1],1, dia[1][0],dia[1][1],2, dia[2][0],dia[2][1],3);
+                if (!dir) {
+                    break;      /* early terminate */
+                }
+                idx = M1MOD4[dir + idx - 1];    /* next start array index */
+            }
+        }
+        break;
+    default:                    /* XAVS2_ME_FS: full search */
+        omx = bmx;
+        omy = bmy;
+        for (j = -me_range; j < me_range; j++) {
+            for (i = -me_range; i < me_range; i++) {
+                ME_COST_IPEL8(omx + i, omy + j);
+            }
+        }
+        break;
+    }
 
-    // g_me_time[0]++;
     /* -------------------------------------------------------------
-     * try MVP and some key searching points */
-    pmv = MAKEDWORD(mvc[0][0], mvc[0][1]);   /* mvc[0][] is the MVP */
+     * store the results of fullpel search */
+    p_me->bmv.v  = MAKEDWORD(FPEL(bmx), FPEL(bmy));
+    p_me->bmv2.v = MAKEDWORD(bmx, bmy);
+    p_me->bcost  = bcost;
+    p_me->bcost2 = bcost;
+    p_me->mvcost[PDIR_FWD] = MV_COST_IPEL(bmx, bmy);
+
+    /* -------------------------------------------------------------
+     * sub-pel refine */
+    if (h->use_fractional_me) {
+        bcost = me_subpel_refine(h, p_me);
+    }
 
+_me_error8:
+    return bcost;
+    } else {
+    pel10_t *p_org = p_me->p_fenc10;
+    pel10_t *p_fref = p_me->p_fref_1st->planes10[IMG_Y] + p_me->i_bias;
     for (i = 0; i < i_mvc; i++) {
         int mx = mvc[i][0];
         int my = mvc[i][1];
-        ME_COST_IPEL(mx, my);
+        ME_COST_IPEL10(mx, my);
     }
 
     if (bcost == MAX_DISTORTION) {
-        goto _me_error;         /* me failed */
+        goto _me_error10;         /* me failed */
     }
 
     /* -------------------------------------------------------------
      * search using different method */
+
     switch (h->param->me_method) {
     case XAVS2_ME_TZ: {       /* TZ */
         const int RasterDistance = 16;
@@ -933,11 +1682,11 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
 
         omx = bmx;
         omy = bmy;
-        ME_COST_IPEL_X3(-2, 0, -1,  2,  1,  2);
-        ME_COST_IPEL_X3( 2, 0,  1, -2, -1, -2);
+        ME_COST_IPEL10_X3(-2, 0, -1,  2,  1,  2);
+        ME_COST_IPEL10_X3( 2, 0,  1, -2, -1, -2);
 
         if (CHECK_MV_RANGE(bmx, bmy)) {
-            DIA_ITER(bmx, bmy);
+            DIA_ITER10(bmx, bmy);
         }
 
         mvinfo.bcost = bcost;
@@ -945,7 +1694,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
         mvinfo.bmx   = bmx;
         mvinfo.bmy   = bmy;
         mvinfo.bdir  = 0;
-        tz_pattern_search(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, EarlyExitIters, me_range);
+        tz_pattern_search10(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, EarlyExitIters, me_range);
         bcost = mvinfo.bcost;
         bdist = mvinfo.bdist;
         bmx   = mvinfo.bmx;
@@ -964,10 +1713,10 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
             mv2_x = bmx + offsets[(dir - 1) * 2 + 1][0];    /*     4 * 5     */
             mv2_y = bmy + offsets[(dir - 1) * 2 + 1][1];    /*   X 6 7 8 X   */
             if (CHECK_MV_RANGE(mv1_x, mv1_y)) {             /*     X   X     */
-                ME_COST_IPEL(mv1_x, mv1_y);
+                ME_COST_IPEL10(mv1_x, mv1_y);
             }
             if (CHECK_MV_RANGE(mv2_x, mv2_y)) {
-                ME_COST_IPEL(mv2_x, mv2_y);
+                ME_COST_IPEL10(mv2_x, mv2_y);
             }
 
             /* if no new point is found, stop */
@@ -986,7 +1735,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
             int rmv_x_max = XAVS2_MIN(mv_x_max, bmx + RasterDistance - 2);
             for (j = rmv_y_min; j < rmv_y_max; j += iRasterDist) {
                 for (i = rmv_x_min; i < rmv_x_max; i += iRasterDist) {
-                    ME_COST_IPEL_X4(i, j, i, j + iRasterDist2, i + iRasterDist2, j, i + iRasterDist2, j + iRasterDist2);
+                    ME_COST_IPEL10_X4(i, j, i, j + iRasterDist2, i + iRasterDist2, j, i + iRasterDist2, j + iRasterDist2);
                 }
             }
         }
@@ -998,7 +1747,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
             mvinfo.bmx   = bmx;
             mvinfo.bmy   = bmy;
             mvinfo.bdir  = 0;
-            tz_pattern_search(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, MaxIters, me_range);
+            tz_pattern_search10(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, MaxIters, me_range);
             bcost = mvinfo.bcost;
             bdist = mvinfo.bdist;
             bmx   = mvinfo.bmx;
@@ -1013,10 +1762,10 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
                     mv2_x = bmx + offsets[(dir - 1) * 2 + 1][0]; /*  X 6 7 8 X  */
                     mv2_y = bmy + offsets[(dir - 1) * 2 + 1][1]; /*    X   X    */
                     if (CHECK_MV_RANGE(mv1_x, mv1_y)) {
-                        ME_COST_IPEL(mv1_x, mv1_y);
+                        ME_COST_IPEL10(mv1_x, mv1_y);
                     }
                     if (CHECK_MV_RANGE(mv2_x, mv2_y)) {
-                        ME_COST_IPEL(mv2_x, mv2_y);
+                        ME_COST_IPEL10(mv2_x, mv2_y);
                     }
                 }
                 break;
@@ -1024,14 +1773,14 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
         }
 
         /* equivalent to the above, but eliminates duplicate candidates */
-        goto umh_step_2;
+        goto umh_step10_2;
     }
     case XAVS2_ME_UMH:        /* UMH */
         /* http://www.cnblogs.com/TaigaCon/archive/2014/06/16/3788984.html
-         * 0. ³õÊ¼µãËÑË÷ */
-        DIA_ITER(mvc[0][0], mvc[0][1]);
+         * 0. initial point search */
+        DIA_ITER10(mvc[0][0], mvc[0][1]);
         if (pmv && (bmx != mvc[0][0] || bmy != mvc[0][1])) {
-            DIA_ITER(bmx, bmy);
+            DIA_ITER10(bmx, bmy);
             pmv = MAKEDWORD(bmx, bmy);
         }
 
@@ -1039,53 +1788,53 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
         if (p_me->mvp1.v != 0) {
             int mx = IPEL(p_me->mvp1.x);
             int my = IPEL(p_me->mvp1.y);
-            ME_COST_IPEL(mx, my);
+            ME_COST_IPEL10(mx, my);
         }
-        EARLY_TERMINATION(p_me->pred_sad_uplayer);
+        EARLY_TERMINATION10(p_me->pred_sad_uplayer);
         // g_me_time[1]++;
 
         // prediction using mv of last ref_idx motion vector
         if (p_me->i_ref_idx > 0) {
-            ME_COST_IPEL(IPEL(p_me->mvp2.x), IPEL(p_me->mvp2.y));
+            ME_COST_IPEL10(IPEL(p_me->mvp2.x), IPEL(p_me->mvp2.y));
         }
         if (p_me->mvp3.v != 0) {
-            ME_COST_IPEL(IPEL(p_me->mvp3.x), IPEL(p_me->mvp3.y));
+            ME_COST_IPEL10(IPEL(p_me->mvp3.x), IPEL(p_me->mvp3.y));
         }
 
-        /* µ±Ç°×îÓÅMV²»ÊÇ MVP£¬ËÑË÷ÆäÖÜÎ§Ò»¸öÐ¡´°¿Ú */
+        /* the current best MV is not the MVP: search a small window around it */
         if (pmv != (uint32_t)MAKEDWORD(bmx, bmy)) {
-            DIA_ITER(bmx, bmy);
+            DIA_ITER10(bmx, bmy);
         }
 
         // early termination algorithm
-        EARLY_TERMINATION(p_me->pred_sad);
+        EARLY_TERMINATION10(p_me->pred_sad);
 
         // umh_step_1:
-        /* UMH 1. Unsymmetrical-cross search £¨·Ç¶Ô³ÆÊ®×ÖËÑË÷£© */
+        /* UMH 1. Unsymmetrical-cross search */
         // g_me_time[2]++;
         omx = bmx;
         omy = bmy;
         for (i = 1; i <= me_range; i += 2) {
-            ME_COST_IPEL(omx + i, omy);
-            ME_COST_IPEL(omx - i, omy);
+            ME_COST_IPEL10(omx + i, omy);
+            ME_COST_IPEL10(omx - i, omy);
         }
         for (j = 1; j <= me_range / 2; j += 2) {
-            ME_COST_IPEL(omx, omy + j);
-            ME_COST_IPEL(omx, omy - j);
+            ME_COST_IPEL10(omx, omy + j);
+            ME_COST_IPEL10(omx, omy - j);
         }
 
         // early termination algorithm
-        EARLY_TERMINATION(p_me->pred_sad);
+        EARLY_TERMINATION10(p_me->pred_sad);
 
-        /* UMH 2. Spiral search £¨ÂÝÐýËÑË÷£© */
+        /* UMH 2. Spiral search */
         omx = bmx;
         omy = bmy;
         for (i = 0; i < 24; i++) {
-            ME_COST_IPEL(omx + GRID[i][0], omy + GRID[i][1]);
+            ME_COST_IPEL10(omx + GRID[i][0], omy + GRID[i][1]);
         }
 
         // early termination algorithm
-        EARLY_TERMINATION(p_me->pred_sad);
+        EARLY_TERMINATION10(p_me->pred_sad);
 
         // big hexagon
         if (h->UMH_big_hex_level) {
@@ -1093,26 +1842,26 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
                 omx = bmx;
                 omy = bmy;
                 for (i = 0; i < umh_1_3_step; i++) {
-                    ME_COST_IPEL(omx + search_patern[i][0] * j, omy + search_patern[i][1] * j);
+                    ME_COST_IPEL10(omx + search_patern[i][0] * j, omy + search_patern[i][1] * j);
                 }
                 if (bmx != omx || bmy != omy) {
-                    EARLY_TERMINATION(p_me->pred_sad);
+                    EARLY_TERMINATION10(p_me->pred_sad);
                 }
             }
         }
         /* !!! NO break statement here */
     case XAVS2_ME_HEX:        /* hexagon search */
-umh_step_2 :                  /* UMH 3. Uneven Multi-Hexagon-grid Search £¨²»¹æÂÉÁù±ßÐÎÄ£°åËÑË÷£© */
+umh_step10_2 :                  /* UMH 3. Uneven Multi-Hexagon-grid Search */
         // g_me_time[3]++;
         dir = 0;                                        /*   6   5   */
         omx = bmx;                                      /*           */
         omy = bmy;                                      /* 1   *   4 */
-        ME_COST_IPEL_X3_DIR(-1,-2,6,  1,-2,5, -2,0,1);  /*           */
-        ME_COST_IPEL_X3_DIR( 2, 0,4, -1, 2,2,  1,2,3);  /*   2   3   */
+        ME_COST_IPEL10_X3_DIR(-1,-2,6,  1,-2,5, -2,0,1);  /*           */
+        ME_COST_IPEL10_X3_DIR( 2, 0,4, -1, 2,2,  1,2,3);  /*   2   3   */
 
         if (dir) {
             const int8_t (*hex)[2];
-            /* UMH 4. Extended Hexagon-based Search £¨Áù±ßÐÎÄ£°å·´¸´ËÑË÷£© */
+            /* UMH 4. Extended Hexagon-based Search */
             idx = dir - 1;      /* start array index */
             /* half hexagon, not overlapping the previous iteration */
             for (i = 0; i < me_range && CHECK_MV_RANGE(bmx, bmy); i++) {
@@ -1120,7 +1869,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
                 omx = bmx;
                 omy = bmy;
                 hex = &HEX2[idx];
-                ME_COST_IPEL_X3_DIR(hex[0][0],hex[0][1],1, hex[1][0],hex[1][1],2, hex[2][0],hex[2][1],3);
+                ME_COST_IPEL10_X3_DIR(hex[0][0],hex[0][1],1, hex[1][0],hex[1][1],2, hex[2][0],hex[2][1],3);
                 if (!dir) {
                     break;      /* early terminate */
                 }
@@ -1129,12 +1878,12 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
         }
         /* !!! NO break statement here */
     case XAVS2_ME_DIA:        /* diamond search */
-umh_step_3:                   /* UMH 5. the third step with a small search pattern £¨Ð¡ÁâÐÎÄ£°å·´¸´ËÑË÷£© */
+umh_step10_3:                   /* UMH 5. the third step with a small search pattern */
         dir = 0;
         if (CHECK_MV_RANGE(bmx, bmy)) {
             omx = bmx;                                          /*    4    */
             omy = bmy;                                          /*  1 * 3  */
-            ME_COST_IPEL_X4_DIR(0,-1,4, -1,0,1, 1,0,3, 0,1,2);  /*    2    */
+            ME_COST_IPEL10_X4_DIR(0,-1,4, -1,0,1, 1,0,3, 0,1,2);  /*    2    */
         }
         if (dir) {
             const int8_t (*dia)[2];
@@ -1145,7 +1894,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
                 omx = bmx;
                 omy = bmy;
                 dia = &DIA1[idx];
-                ME_COST_IPEL_X3_DIR(dia[0][0],dia[0][1],1, dia[1][0],dia[1][1],2, dia[2][0],dia[2][1],3);
+                ME_COST_IPEL10_X3_DIR(dia[0][0],dia[0][1],1, dia[1][0],dia[1][1],2, dia[2][0],dia[2][1],3);
                 if (!dir) {
                     break;      /* early terminate */
                 }
@@ -1158,7 +1907,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
         omy = bmy;
         for (j = -me_range; j < me_range; j++) {
             for (i = -me_range; i < me_range; i++) {
-                ME_COST_IPEL(omx + i, omy + j);
+                ME_COST_IPEL10(omx + i, omy + j);
             }
         }
         break;
@@ -1178,8 +1927,9 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
         bcost = me_subpel_refine(h, p_me);
     }
 
-_me_error:
+_me_error10:
     return bcost;
+    }
 }
 
 
@@ -1187,13 +1937,98 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc
  * find motion vector for forward dual hypothesis prediction (sub-pel search)
  * return minimum motion cost after search
  */
-dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *mv)
+dist_t xavs2_me_search_sym8(xavs2_t *h, xavs2_me_t *p_me, pel8_t *buf_pixel_temp, mv_t *mv)
+{
+    const int search_pos2 = 5;  // search positions for    half-pel search  (default: 9)
+    const int search_pos4 = 5;  // search positions for quarter-pel search  (default: 9)
+    pel8_t **p_filtered1 = p_me->p_fref_1st->filtered8;
+    pel8_t **p_filtered2 = p_me->p_fref_2nd->filtered8;
+    pel8_t *p_org = p_me->p_fenc8;
+    int distance_fwd = p_me->i_distance_1st;
+    int distance_bwd = p_me->i_distance_2nd;
+    int i_pixel  = p_me->i_pixel;
+    int i_offset = p_me->i_bias;
+    int ctr_x    = (p_me->mvp1.x >> 1) << 1;    // mvp1.x with its lowest bit cleared
+    int ctr_y    = (p_me->mvp1.y >> 1) << 1;    // mvp1.y with its lowest bit cleared
+    int mv_x_min = p_me->mv_min[0];
+    int mv_y_min = p_me->mv_min[1];
+    int mv_x_max = p_me->mv_max[0];
+    int mv_y_max = p_me->mv_max[1];
+    int lambda   = h->i_lambda_factor;
+    int min_pos2 = (h->param->enable_hadamard ? 0 : 1);    // Spiral[0] is only tried in the half-pel loop when hadamard is enabled
+    int max_pos2 = (h->param->enable_hadamard ? XAVS2_MAX(1, search_pos2) : search_pos2);
+    const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min);
+    const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000;
+    const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp.x;
+    const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp.y;
+    mv_t bmv = *mv;  // best mv
+    dist_t bcost = MAX_DISTORTION;
+    dist_t cost;     // never assigned directly below; presumably written by ME_COST_QPEL8_SYM -- confirm
+    int pos;
+    int mx, my;      // candidate vector handed to the cost macro
+    int i_fref = p_me->p_fref_1st->i_stride[IMG_Y];
+    /* NOTE(review): many locals above are not referenced by name below; presumably the ME_COST_QPEL8_SYM / MV_COST_FPEL macros use them -- confirm before renaming. */
+    if (!h->use_fractional_me) {
+        mx = mv->x;
+        my = mv->y;
+        /* fractional ME disabled: cost the incoming mv only; this branch has no explicit write to *mv or mvcost[PDIR_SYM] */
+        ME_COST_QPEL8_SYM;
+        bcost = cost;
+        bmv.v = MAKEDWORD(mx, my);    // local copy only; not stored back through mv before returning
+        return bcost;
+    }
+
+    // first stage: try Spiral offsets doubled (<< 1) around the incoming mv
+    for (pos = min_pos2; pos < max_pos2; pos++) {
+        mx = mv->x + (Spiral[pos][0] << 1);    // quarter-pel units
+        my = mv->y + (Spiral[pos][1] << 1);    // quarter-pel units
+
+        ME_COST_QPEL8_SYM;
+        if (cost < bcost) {
+            bcost = cost;
+            bmv.v = MAKEDWORD(mx, my);
+        }
+    }
+
+    mv->v = bmv.v;    // best vector so far becomes the centre used by the quarter-pel loop below
+
+    /* -------------------------------------------------------------
+     * quarter-pel refine */
+
+    // second stage: undoubled Spiral offsets, only when use_fractional_me >= 2
+    if (h->use_fractional_me >= 2) {
+        for (pos = 1; pos < search_pos4; pos++) {    // starts at 1, so Spiral[0] is not evaluated here
+            if (h->param->enable_pmvr) {
+                if (pmvr_adapt_mv(&mx, &my, ctr_x, ctr_y, mv->x, mv->y, Spiral[pos][0], Spiral[pos][1])) {
+                    continue;    // non-zero return from pmvr_adapt_mv: this candidate is skipped
+                }
+            } else {
+                mx = mv->x + Spiral[pos][0];    // quarter-pel units
+                my = mv->y + Spiral[pos][1];    // quarter-pel units
+            }
+
+            ME_COST_QPEL8_SYM;
+            if (cost < bcost) {
+                bcost = cost;
+                bmv.v = MAKEDWORD(mx, my);
+            }
+        }
+    }
+
+    mv->v = bmv.v;    // report the final best vector to the caller
+    p_me->mvcost[PDIR_SYM] = MV_COST_FPEL(bmv.x, bmv.y);    // MV_COST_FPEL of the final best vector, stored in the PDIR_SYM slot
+
+    // return minimum motion cost
+    return bcost;
+}
+
+dist_t xavs2_me_search_sym10(xavs2_t *h, xavs2_me_t *p_me, pel10_t *buf_pixel_temp, mv_t *mv)
 {
     const int search_pos2 = 5;  // search positions for    half-pel search  (default: 9)
     const int search_pos4 = 5;  // search positions for quarter-pel search  (default: 9)
-    pel_t **p_filtered1 = p_me->p_fref_1st->filtered;
-    pel_t **p_filtered2 = p_me->p_fref_2nd->filtered;
-    pel_t *p_org = p_me->p_fenc;
+    pel10_t **p_filtered1 = p_me->p_fref_1st->filtered10;
+    pel10_t **p_filtered2 = p_me->p_fref_2nd->filtered10;
+    pel10_t *p_org = p_me->p_fenc10;
     int distance_fwd = p_me->i_distance_1st;
     int distance_bwd = p_me->i_distance_2nd;
     int i_pixel  = p_me->i_pixel;
@@ -1222,7 +2057,7 @@ dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp,
         mx = mv->x;
         my = mv->y;
 
-        ME_COST_QPEL_SYM;
+        ME_COST_QPEL10_SYM;
         bcost = cost;
         bmv.v = MAKEDWORD(mx, my);
         return bcost;
@@ -1233,7 +2068,7 @@ dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp,
         mx = mv->x + (Spiral[pos][0] << 1);    // quarter-pel units
         my = mv->y + (Spiral[pos][1] << 1);    // quarter-pel units
 
-        ME_COST_QPEL_SYM;
+        ME_COST_QPEL10_SYM;
         if (cost < bcost) {
             bcost = cost;
             bmv.v = MAKEDWORD(mx, my);
@@ -1257,7 +2092,7 @@ dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp,
                 my = mv->y + Spiral[pos][1];    // quarter-pel units
             }
 
-            ME_COST_QPEL_SYM;
+            ME_COST_QPEL10_SYM;
             if (cost < bcost) {
                 bcost = cost;
                 bmv.v = MAKEDWORD(mx, my);
@@ -1275,11 +2110,127 @@ dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp,
 /* ---------------------------------------------------------------------------
  * return minimum motion cost after search (sub-pel search)
  */
-dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc)
+dist_t xavs2_me_search_bid8(xavs2_t *h, xavs2_me_t *p_me, pel8_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc)
+{   /* pel8_t variant of the bi-directional sub-pel search: refines *fwd_mv while *bwd_mv is only read; returns the lowest cost evaluated */
+    pel8_t **p_filtered1 = p_me->p_fref_1st->filtered8;    // not referenced directly below; presumably used by ME_COST_QPEL8_BID - TODO confirm
+    pel8_t **p_filtered2 = p_me->p_fref_2nd->filtered8;    // indexed below by ((my & 3) << 2) + (mx & 3); an entry may be NULL
+    pel8_t *p_org = p_me->p_fenc8;
+    const int search_pos2 = 9;  // search positions for    half-pel search  (default: 9)
+    const int search_pos4 = 9;  // search positions for quarter-pel search  (default: 9)
+    int i_pixel  = p_me->i_pixel;
+    int i_offset = p_me->i_bias;
+    int ctr_x    = (p_me->mvp1.x >> 1) << 1;    // mvp1 with its lowest bit cleared; handed to pmvr_adapt_mv() as the centre
+    int ctr_y    = (p_me->mvp1.y >> 1) << 1;
+    int mv_x_min = p_me->mv_min[0];
+    int mv_y_min = p_me->mv_min[1];
+    int mv_x_max = p_me->mv_max[0];
+    int mv_y_max = p_me->mv_max[1];
+    int lambda   = h->i_lambda_factor;
+    int min_pos2 = (h->param->enable_hadamard ? 0 : 1);    // with hadamard enabled the first loop starts at Spiral[0], otherwise at Spiral[1]
+    int max_pos2 = (h->param->enable_hadamard ? XAVS2_MAX(1, search_pos2) : search_pos2);
+    int block_w = p_me->i_block_w;
+    int xx2;
+    int yy2;
+    int mv_bid_bit;
+    const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min);    // packed limits; presumably consumed by CHECK_MV_RANGE - TODO confirm
+    const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000;
+    const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp1.x;    // h->mvbits offset by the predictors; presumably read by the MV_COST_FPEL* macros - TODO confirm
+    const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp1.y;
+    const uint16_t *p_cost_bix = h->mvbits - p_me->mvp2.x;
+    const uint16_t *p_cost_biy = h->mvbits - p_me->mvp2.y;
+    mv_t bmv = *fwd_mv; // best mv
+    dist_t bcost = MAX_DISTORTION;
+    dist_t cost;
+    int mx, my, mx_bid, my_bid;
+    int pos;
+    int i_fref = p_me->p_fref_1st->i_stride[IMG_Y];    // luma stride of the 1st reference; also applied to the 2nd reference below (assumes equal strides)
+    coeff_t *cur_blk = p_enc->coeff_blk;
+
+    mx_bid = bwd_mv->x;
+    my_bid = bwd_mv->y;
+
+    // Here the formula relating the coded value to the prediction is rearranged into 2 * coded value - backward prediction
+    xx2 = mx_bid >> 2;    // integer-pel part of the backward MV (MVs are in quarter-pel units)
+    yy2 = my_bid >> 2;
+    mv_bid_bit = MV_COST_FPEL_BID(mx_bid, my_bid);
+
+    if (CHECK_MV_RANGE(mx_bid, my_bid)) {
+        pel8_t *p_src2 = p_filtered2[((my_bid & 3) << 2) + (mx_bid & 3)];    // plane selected by the fractional part of the backward MV
+
+        if (p_src2 != NULL) {
+            p_src2 += i_offset + yy2 * i_fref + xx2;
+            g_funcs.pixf.sub_ps8[i_pixel](cur_blk, block_w, p_org, p_src2, FENC_STRIDE, i_fref);// M-A: presumably cur_blk = p_org - backward prediction
+        } else {
+            ALIGN32(pel8_t tmp_pred8[MAX_CU_SIZE * MAX_CU_SIZE]);    // no stored plane for this phase: build the prediction with mc_luma8()
+            mv_t mvt;
+            mvt.x = (int16_t)mx_bid;
+            mvt.y = (int16_t)my_bid;
+            get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, block_w, p_me->i_block_h);
+            mc_luma8(h, tmp_pred8, MAX_CU_SIZE, mvt.x, mvt.y, block_w, p_me->i_block_h, p_me->p_fref_2nd);
+            g_funcs.pixf.sub_ps8[i_pixel](cur_blk, block_w, p_org, tmp_pred8, FENC_STRIDE, MAX_CU_SIZE);// M-A: same difference, taken against the temporary prediction
+        }
+        g_funcs.pixf.add_ps8[i_pixel](h, buf_pixel_temp, MAX_CU_SIZE, p_org, cur_blk, FENC_STRIDE, block_w);// M-A+M: presumably buf_pixel_temp = p_org + cur_blk
+    }   // NOTE(review): when the range check fails buf_pixel_temp is not written here; confirm ME_COST_QPEL8_BID never reads it unset
+
+    if (!h->use_fractional_me) {
+        mx = fwd_mv->x;
+        my = fwd_mv->y;
+
+        ME_COST_QPEL8_BID;    // macro defined elsewhere; presumably sets `cost` for (mx, my) - TODO confirm
+        bcost = cost;
+        bmv.v = MAKEDWORD(mx, my);
+        return bcost;    // NOTE(review): this path returns without writing p_me->mvcost[PDIR_BID]; bmv is assigned but not used afterwards
+    }
+
+    // loop over search positions
+    for (pos = min_pos2; pos < max_pos2; pos++) {
+        mx = fwd_mv->x + (Spiral[pos][0] << 1);    // quarter-pel units
+        my = fwd_mv->y + (Spiral[pos][1] << 1);    // quarter-pel units
+
+        ME_COST_QPEL8_BID;
+        if (cost < bcost) {
+            bcost = cost;
+            bmv.v = MAKEDWORD(mx, my);
+        }
+    }
+
+    fwd_mv->v = bmv.v;    // commit the best result of the first pass before the quarter-pel refine
+
+    /* -------------------------------------------------------------
+     * quarter-pel refine */
+
+    // loop over search positions
+    if (h->use_fractional_me >= 2) {
+        for (pos = 1; pos < search_pos4; pos++) {
+            if (h->param->enable_pmvr) {
+                if (pmvr_adapt_mv(&mx, &my, ctr_x, ctr_y, fwd_mv->x, fwd_mv->y, Spiral[pos][0], Spiral[pos][1])) {
+                    continue;    // non-zero return from pmvr_adapt_mv(): skip this candidate
+                }
+            } else {
+                mx = fwd_mv->x + Spiral[pos][0];    // quarter-pel units
+                my = fwd_mv->y + Spiral[pos][1];    // quarter-pel units
+            }
+
+            ME_COST_QPEL8_BID;
+            if (cost < bcost) {
+                bcost = cost;
+                bmv.v = MAKEDWORD(mx, my);
+            }
+        }
+    }
+
+    fwd_mv->v = bmv.v;
+    p_me->mvcost[PDIR_BID] = MV_COST_FPEL(bmv.x, bmv.y) + MV_COST_FPEL_BID(mx_bid, my_bid);    // stored MV cost: best forward MV plus the fixed backward MV
+
+    // return minimum motion cost
+    return bcost;
+}
+
+dist_t xavs2_me_search_bid10(xavs2_t *h, xavs2_me_t *p_me, pel10_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc)
 {
-    pel_t **p_filtered1 = p_me->p_fref_1st->filtered;
-    pel_t **p_filtered2 = p_me->p_fref_2nd->filtered;
-    pel_t *p_org = p_me->p_fenc;
+    pel10_t **p_filtered1 = p_me->p_fref_1st->filtered10;
+    pel10_t **p_filtered2 = p_me->p_fref_2nd->filtered10;
+    pel10_t *p_org = p_me->p_fenc10;
     const int search_pos2 = 9;  // search positions for    half-pel search  (default: 9)
     const int search_pos4 = 9;  // search positions for quarter-pel search  (default: 9)
     int i_pixel  = p_me->i_pixel;
@@ -1314,34 +2265,34 @@ dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp,
     mx_bid = bwd_mv->x;
     my_bid = bwd_mv->y;
 
-    //ÔÚÕâÀï°Ñ±àÂëÖµÓëÔ¤²âÖµµÄ¼ÆËã¹«Ê½»»ËãÎª2±¶±àÂëÖµ-ºóÏòÔ¤²âÖµ
+    // Here the formula relating the coded value to the prediction is rearranged into 2 * coded value - backward prediction
     xx2 = mx_bid >> 2;
     yy2 = my_bid >> 2;
     mv_bid_bit = MV_COST_FPEL_BID(mx_bid, my_bid);
 
     if (CHECK_MV_RANGE(mx_bid, my_bid)) {
-        pel_t *p_src2 = p_filtered2[((my_bid & 3) << 2) + (mx_bid & 3)];
+        pel10_t *p_src2 = p_filtered2[((my_bid & 3) << 2) + (mx_bid & 3)];
 
         if (p_src2 != NULL) {
             p_src2 += i_offset + yy2 * i_fref + xx2;
-            g_funcs.pixf.sub_ps[i_pixel](cur_blk, block_w, p_org, p_src2, FENC_STRIDE, i_fref);//M-A
+            g_funcs.pixf.sub_ps10[i_pixel](cur_blk, block_w, p_org, p_src2, FENC_STRIDE, i_fref);//M-A
         } else {
-            ALIGN32(pel_t tmp_pred[MAX_CU_SIZE * MAX_CU_SIZE]);
+            ALIGN32(pel10_t tmp_pred10[MAX_CU_SIZE * MAX_CU_SIZE]);
             mv_t mvt;
             mvt.x = (int16_t)mx_bid;
             mvt.y = (int16_t)my_bid;
             get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, block_w, p_me->i_block_h);
-            mc_luma(tmp_pred, MAX_CU_SIZE, mvt.x, mvt.y, block_w, p_me->i_block_h, p_me->p_fref_2nd);
-            g_funcs.pixf.sub_ps[i_pixel](cur_blk, block_w, p_org, tmp_pred, FENC_STRIDE, MAX_CU_SIZE);//M-A
+            mc_luma10(h, tmp_pred10, MAX_CU_SIZE, mvt.x, mvt.y, block_w, p_me->i_block_h, p_me->p_fref_2nd);
+            g_funcs.pixf.sub_ps10[i_pixel](cur_blk, block_w, p_org, tmp_pred10, FENC_STRIDE, MAX_CU_SIZE);//M-A
         }
-        g_funcs.pixf.add_ps[i_pixel](buf_pixel_temp, MAX_CU_SIZE, p_org, cur_blk, FENC_STRIDE, block_w);//M-A+M
+        g_funcs.pixf.add_ps10[i_pixel](h, buf_pixel_temp, MAX_CU_SIZE, p_org, cur_blk, FENC_STRIDE, block_w);//M-A+M
     }
 
     if (!h->use_fractional_me) {
         mx = fwd_mv->x;
         my = fwd_mv->y;
 
-        ME_COST_QPEL_BID;
+        ME_COST_QPEL10_BID;
         bcost = cost;
         bmv.v = MAKEDWORD(mx, my);
         return bcost;
@@ -1352,7 +2303,7 @@ dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp,
         mx = fwd_mv->x + (Spiral[pos][0] << 1);    // quarter-pel units
         my = fwd_mv->y + (Spiral[pos][1] << 1);    // quarter-pel units
 
-        ME_COST_QPEL_BID;
+        ME_COST_QPEL10_BID;
         if (cost < bcost) {
             bcost = cost;
             bmv.v = MAKEDWORD(mx, my);
@@ -1376,7 +2327,7 @@ dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp,
                 my = fwd_mv->y + Spiral[pos][1];    // quarter-pel units
             }
 
-            ME_COST_QPEL_BID;
+            ME_COST_QPEL10_BID;
             if (cost < bcost) {
                 bcost = cost;
                 bmv.v = MAKEDWORD(mx, my);
diff --git a/source/encoder/me.h b/source/encoder/me.h
index 2d88fb2..1eb0e93 100644
--- a/source/encoder/me.h
+++ b/source/encoder/me.h
@@ -120,9 +120,13 @@ void xavs2_me_init_umh_threshold(xavs2_t *h, double *bsize, int i_qp);
 #define xavs2_me_search FPFX(me_search)
 dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc);
 
-#define xavs2_me_search_sym FPFX(me_search_sym)
-dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *mv);
-#define xavs2_me_search_bid FPFX(me_search_bid)
-dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc);
+#define xavs2_me_search_sym8 FPFX(me_search_sym8)
+dist_t xavs2_me_search_sym8(xavs2_t *h, xavs2_me_t *p_me, pel8_t *buf_pixel_temp, mv_t *mv);    /* symmetric search on pel8_t buffers; returns the minimum cost */
+#define xavs2_me_search_sym10 FPFX(me_search_sym10)
+dist_t xavs2_me_search_sym10(xavs2_t *h, xavs2_me_t *p_me, pel10_t *buf_pixel_temp, mv_t *mv);    /* symmetric search on pel10_t buffers; returns the minimum cost */
+#define xavs2_me_search_bid8 FPFX(me_search_bid8)
+dist_t xavs2_me_search_bid8(xavs2_t *h, xavs2_me_t *p_me, pel8_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc);    /* bi-directional search on pel8_t buffers; updates *fwd_mv */
+#define xavs2_me_search_bid10 FPFX(me_search_bid10)
+dist_t xavs2_me_search_bid10(xavs2_t *h, xavs2_me_t *p_me, pel10_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc);    /* bi-directional search on pel10_t buffers; updates *fwd_mv */
 
 #endif  // XAVS2_ME_H
diff --git a/source/encoder/parameters.c b/source/encoder/parameters.c
index 9adec5b..13dea95 100644
--- a/source/encoder/parameters.c
+++ b/source/encoder/parameters.c
@@ -207,7 +207,7 @@ mapping_default(xavs2_param_map_t *p_map_tab, xavs2_param_t *p)
     MAP("ALFLowLatencyEncodingEnable",  &p->alf_LowLatencyEncoding,     MAP_NUM, "Enable Low Latency ALF (1=Low Latency mode, 0=High Efficiency mode)");
     MAP("CrossSliceLoopFilter",         &p->b_cross_slice_loop_filter,  MAP_NUM, "Enable Cross Slice Boundary Filter (0=Disable, 1=Enable)");
 
-    /* ³¡±àÂë²ÎÊý */
+    /* field coding parameters */
     // MAP("InterlaceCodingOption",        &p->InterlaceCodingOption,      MAP_NUM);
     // MAP("RepeatFirstField",             &p->repeat_first_field,         MAP_NUM);
     // MAP("TopFieldFirst",                &p->top_field_first,            MAP_NUM);
@@ -425,7 +425,7 @@ int ParameterNameToMapIndex(xavs2_param_map_t *p_map_tab, const char *param_name
     mapping_t *map_tab = p_map_tab->map_tab;
     int i = 0;
 
-    while (map_tab[i].name[0] != '\0') {  // ÖÕÖ¹Î»ÖÃÊÇ¿Õ×Ö·û´®
+    while (map_tab[i].name[0] != '\0') {  // ç»ˆæ­¢ä½ç½®æ˜¯ç©ºå­—ç¬¦ä¸²
         if (xavs2_param_match(map_tab[i].name, param_name)) {
             return i;
         } else {
@@ -439,7 +439,7 @@ int ParameterNameToMapIndex(xavs2_param_map_t *p_map_tab, const char *param_name
 /* ---------------------------------------------------------------------------
  */
 static INLINE
-void get_param_name(char *name, const char *param_item)
+void get_param_name(char *name, char *param_item)
 {
     char *str;
     name[0] = '\0';
@@ -629,7 +629,7 @@ xavs2_encoder_opt_set(xavs2_param_t *param, int argc, char *argv[])
     int   in_item = 0;
     int   i;
 
-    if ((contents = xavs2_get_configs(argc, argv)) == NULL) {
+    if ((contents = xavs2_get_configs(argc, (const char * const *)argv)) == NULL) {
         fprintf(stderr, "get contents from configure file error.");
         return -1;
     }
diff --git a/source/encoder/presets.c b/source/encoder/presets.c
index dd6f5af..2143112 100644
--- a/source/encoder/presets.c
+++ b/source/encoder/presets.c
@@ -53,7 +53,7 @@
  * ===========================================================================
  */
 /* ---------------------------------------------------------------------------
- * Ö¡ÄÚÁÁ¶È¿éµÄRDOÄ£Ê½ÊýÁ¿£¬¶ÔÓ¦²»Í¬presetµµ´Î
+ * Number of RDO modes for intra luma blocks, one row per preset level
  */
 static const uint8_t INTRA_FULL_RDO_NUM[][MAX_CU_SIZE_IN_BIT + 1] = {
     { 0, 0, 1, 1, 1, 1, 1 },         /* 0:  1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */
@@ -69,13 +69,13 @@ static const uint8_t INTRA_FULL_RDO_NUM[][MAX_CU_SIZE_IN_BIT + 1] = {
 };
 
 /* ---------------------------------------------------------------------------
- * Ö¡ÄÚÉ«¶È¿é RDO µÄ×î´óÄ£Ê½ÊýÁ¿ (²»Í¬presetµµ´Î)
+ * Maximum number of RDO modes for intra chroma blocks (per preset level)
  */
 static const int8_t tab_num_rdo_chroma_intra_mode[] = {
     1, 2, 2, 2, 3, 3, 4, 4, 5, 5
 };
 
-/* Ö¡ÄÚRMDËÑË÷µÄãÐÖµ£¬²½³¤Îª2ºÍ1ËÑË÷µÄ½Ç¶ÈÊýÁ¿ */
+/* Thresholds for the intra RMD search: number of angles searched at step 2 and at step 1 */
 static const int8_t tab_num_angle_dist2[] = {
     0, 0, 4, 4, 4, 4, 5, 5, 6, 6
 };
@@ -84,14 +84,14 @@ static const int8_t tab_num_angle_dist1[] = {
 };
 
 /* ---------------------------------------------------------------------------
- * È«Áã¿é¼ì²âÊ±µÄÅÐ¶¨ãÐÖµ±¶ÂÊ
+ * Threshold multiplier used when detecting all-zero blocks
  */
 static const float tab_th_zero_block_factor[] = {
     6, 6, 6, 6, 6, 6, 5, 5, 5, 5
 };
 
 /* ---------------------------------------------------------------------------
- * QSFDËã·¨µÄãÐÖµ¼ÆËãÏµÊý£¨²»Í¬preset£©
+ * Threshold coefficients for the QSFD algorithm (per preset)
  */
 const static double tab_qsfd_s_presets[][10] = {
     /* preset_level:
@@ -103,21 +103,23 @@ const static double tab_qsfd_cu_size_weight[4] = {
     0.25, 1.0, 3.0, 7.5  /* 8x8, 16x16, 32x32, 64x64 */
 };
 
-double tab_qsfd_thres[MAX_QP][2][CTU_DEPTH];
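+//extern double tab_qsfd_thres[MAX_QP][2][CTU_DEPTH];  // NOTE(review): the file-scope table is removed here and algorithm_init_thresholds() now fills a function-local array of the same name, so the computed thresholds appear to be discarded on return - confirm where the table is meant to live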
 
 /*--------------------------------------------------------------------------
  */
 static INLINE
 void algorithm_init_thresholds(xavs2_param_t *p_param)
 {
+    double tab_qsfd_thres[MAX_QP + (p_param->sample_bit_depth - 8) * 8][2][CTU_DEPTH];
     int i_preset_level = p_param->preset_level;
     //trade-off encoding time and performance
     const double s_inter = tab_qsfd_s_presets[0][i_preset_level];
     const double s_intra = tab_qsfd_s_presets[1][i_preset_level];
     int i;
 
+    int max_qp = MAX_QP + (p_param->sample_bit_depth - 8) * 8;
     /* QSFD threasholds */
-    for (i = 0; i < MAX_QP; i++) {
+    for (i = 0; i < max_qp; i++) {
         double qstep = 32768.0 / tab_Q_TAB[i];
         double th_base = 350 * pow(qstep, 0.9);
         double th__8 = th_base * tab_qsfd_cu_size_weight[0];
@@ -140,7 +142,7 @@ void algorithm_init_thresholds(xavs2_param_t *p_param)
         tab_qsfd_thres[i][1][3] = th_64 * s_intra * 1.0;
     }
 
-    /* È«Áã¿é¼ì²â */
+    /* all-zero block detection */
     p_param->factor_zero_block = tab_th_zero_block_factor[i_preset_level];
 }
 
@@ -164,7 +166,7 @@ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level)
         p_param->num_max_ref = XAVS2_MIN(i_preset_level, 4);
     }
 
-    /* --------------------------- CU½á¹¹ ---------------------------
+    /* --------------------------- CU structure ---------------------------
     | preset          |  0  |  1  |  2  |   3 |   4 |   5 |   6  |   7  |   8  |  9   |
     +=================+=====+=====+=====+=====+=====+=====+======+======+======+======+
     | ctu             | 32  | 32  | 64  |  64 |  64 |  64 |  64  |  64  |  64  | 64   |
@@ -172,7 +174,7 @@ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level)
     */
     p_param->lcu_bit_level = XAVS2_MIN(p_param->lcu_bit_level, 5 + (i_preset_level > 1));
 
-    /* --------------------------- Ô¤²â ---------------------------
+    /* --------------------------- prediction ---------------------------
     */
     p_param->inter_2pu       = i_preset_level > 1;
     p_param->enable_amp      = i_preset_level > 5;  // NSQT
@@ -183,17 +185,17 @@ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level)
     p_param->enable_dhp      = i_preset_level > 7 && p_param->enable_f_frame;
     p_param->enable_dmh      = i_preset_level > 6 && p_param->enable_f_frame;
 
-    /* --------------------------- ±ä»» --------------------------- */
+    /* --------------------------- transform --------------------------- */
     p_param->enable_sdip       = i_preset_level > 5;
     p_param->enable_nsqt       = i_preset_level > 5;
     p_param->enable_secT       = i_preset_level > -1;
     p_param->b_fast_2lelvel_tu = i_preset_level < 4;
 
-    /* --------------------------- Á¿»¯ ---------------------------
+    /* --------------------------- quantization ---------------------------
      * Level: All for preset 9, Off for preset 0~2 */
     p_param->i_rdoq_level = i_preset_level > 8 ? RDOQ_ALL : i_preset_level > 5 ? RDOQ_CU_LEVEL : RDOQ_OFF;
 
-    /* --------------------------- RDOµµ´Î ---------------------------
+    /* --------------------------- RDO level ---------------------------
     */
     if (i_preset_level < 0) {
         p_param->i_rd_level = RDO_OFF;
@@ -205,7 +207,7 @@ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level)
         p_param->i_rd_level = RDO_ALL;
     }
 
-    /* --------------------------- ìØ±àÂë ---------------------------
+    /* --------------------------- entropy coding ---------------------------
      */
     if (i_preset_level <= 3) {
         p_param->rdo_bit_est_method = 2;
@@ -215,13 +217,13 @@ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level)
         p_param->rdo_bit_est_method = 0;
     }
 
-    /* --------------------------- ÂË²¨ ---------------------------
+    /* --------------------------- filtering ---------------------------
     */
     p_param->enable_alf = p_param->enable_alf && i_preset_level > 4;
     p_param->enable_sao = p_param->enable_sao && i_preset_level > 1;
-    p_param->b_fast_sao = i_preset_level < 5;  // µµ´Î4ÒÔÏÂ¿ªÆô¿ìËÙSAO±àÂë¾ö²ß
+    p_param->b_fast_sao = i_preset_level < 5;  // æ¡£æ¬¡4ä»¥ä¸‹å¼€å¯å¿«é€ŸSAOç¼–ç å†³ç­–
 
-    /* --------------------------- ÆäËû ---------------------------
+    /* --------------------------- others ---------------------------
     */
     p_param->enable_hadamard = i_preset_level > 0;
     p_param->enable_tdrdo    = i_preset_level > 4 && p_param->enable_tdrdo;
@@ -329,8 +331,8 @@ void encoder_set_fast_algorithms(xavs2_t *h)
      * 1, switch on some algorithms with little efficiency loss
      */
 
-    /* ÊÇ·ñÐèÒª·ÖÏñËØÔË¶¯ËÑË÷
-     * ²Î¿¼Ö¡ÊýÁ¿´óÓÚ1¸öÊ±£¬»á³öÏÖMVµÄËõ·Å¶øµ¼ÖÂMVÏñËØ¾«¶È´ïµ½1/4
+    /* Whether sub-pixel motion search is needed.
+     * With more than one reference frame, MV scaling occurs and can raise the MV precision to 1/4 pel.
      */
     if (i_preset_level < 2) {
         h->use_fractional_me = 1;
@@ -355,16 +357,24 @@ void encoder_set_fast_algorithms(xavs2_t *h)
     } else {
         memcpy(h->tab_num_intra_rdo, INTRA_FULL_RDO_NUM[i_preset_level >> 0], sizeof(h->tab_num_intra_rdo));
     }
-    /* RMDËã·¨µÄËÑË÷½Ç¶ÈÊýÁ¿ */
+    /* number of search angles for the RMD algorithm */
     h->num_intra_rmd_dist2  = tab_num_angle_dist2[i_preset_level];
     h->num_intra_rmd_dist1  = tab_num_angle_dist1[i_preset_level];
     h->num_rdo_intra_chroma = tab_num_rdo_chroma_intra_mode[i_preset_level];
 
-    /* Ö¡ÄÚÔ¤²âÄ£Ê½ */
+    /* intra prediction mode */
+    if (h->param->input_sample_bit_depth == 8) {
     if (IS_ALG_ENABLE(OPT_FAST_INTRA_MODE)) {
-        h->get_intra_candidates_luma = rdo_get_pred_intra_luma_rmd;
+        h->get_intra_candidates_luma8 = rdo_get_pred_intra_luma8_rmd;
     } else {
-        h->get_intra_candidates_luma = rdo_get_pred_intra_luma;
+        h->get_intra_candidates_luma8 = rdo_get_pred_intra_luma8;
+    }
+    } else {
+    if (IS_ALG_ENABLE(OPT_FAST_INTRA_MODE)) {
+        h->get_intra_candidates_luma10 = rdo_get_pred_intra_luma10_rmd;
+    } else {
+        h->get_intra_candidates_luma10 = rdo_get_pred_intra_luma10;
+    }
     }
     if (IS_ALG_ENABLE(OPT_FAST_RDO_INTRA_C)) {
         h->get_intra_candidates_chroma = rdo_get_pred_intra_chroma_fast;
diff --git a/source/encoder/ratecontrol.c b/source/encoder/ratecontrol.c
index f071a68..33bb596 100644
--- a/source/encoder/ratecontrol.c
+++ b/source/encoder/ratecontrol.c
@@ -187,10 +187,9 @@ static const double tab_qp_gpp[3][3] = {
 /* ---------------------------------------------------------------------------
 * compute the gradient per pixel
 */
-static double cal_frame_gradient(xavs2_frame_t *frm)
+static double cal_frame_gradient(xavs2_t *h, xavs2_frame_t *frm)
 {
     double grad_per_pixel = 0;        // gradient per pixel
-    pel_t *src = frm->planes[IMG_Y];// pointer to luma component
     int width = frm->i_width[IMG_Y];
     int height = frm->i_lines[IMG_Y];
     int stride = frm->i_stride[IMG_Y];
@@ -199,6 +198,24 @@ static double cal_frame_gradient(xavs2_frame_t *frm)
 
     width--;
     height--;
+
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *src = frm->planes8[IMG_Y];// pointer to luma component
+    for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j++) {
+            int dx = src[j] - src[j + 1];
+            int dy = src[j] - src[j + stride];
+
+            if (dx || dy) {
+                grad_per_pixel += sqrt((double)(dx * dx + dy * dy));
+            }
+        }
+        src += stride;
+    }
+
+    return grad_per_pixel / size;
+    } else {
+    pel10_t *src = frm->planes10[IMG_Y];// pointer to luma component
     for (i = 0; i < height; i++) {
         for (j = 0; j < width; j++) {
             int dx = src[j] - src[j + 1];
@@ -212,6 +229,7 @@ static double cal_frame_gradient(xavs2_frame_t *frm)
     }
 
     return grad_per_pixel / size;
+    }
 }
 #endif
 
@@ -341,7 +359,7 @@ static int rc_calculate_frame_qp(xavs2_t *h, int frm_idx, int frm_type, int forc
     /* compute the initial qp */
     if (frm_idx == 0) {
         double bit = log(1000 * rc->f_target_bpp);
-        double gpp = log(cal_frame_gradient(h->fenc));
+        double gpp = log(cal_frame_gradient(h, h->fenc));
         int    idx = XAVS2_MIN(2, rc->i_intra_period);
         int    max_i_qp = 63 + (h->param->sample_bit_depth - 8) * 8 - 10;
 
@@ -617,7 +635,7 @@ int xavs2_rc_get_frame_qp(xavs2_t *h, int frm_idx, int frm_type, int force_qp)
 */
 int xavs2_rc_get_lcu_qp(xavs2_t *h, int frm_idx, int qp)
 {
-    UNUSED_PARAMETER(h);
+    //UNUSED_PARAMETER(h);
     UNUSED_PARAMETER(frm_idx);
 
     //if (h->param->i_rc_method == XAVS2_RC_CBR_SCU && img->current_mb_nr == 0) {
@@ -691,7 +709,7 @@ int xavs2_rc_get_lcu_qp(xavs2_t *h, int frm_idx, int qp)
 */
 void xavs2_rc_update_after_lcu_coded(xavs2_t *h, int frm_idx, int qp)
 {
-    UNUSED_PARAMETER(h);
+    //UNUSED_PARAMETER(h);
     UNUSED_PARAMETER(frm_idx);
     UNUSED_PARAMETER(qp);
 
diff --git a/source/encoder/rdo.c b/source/encoder/rdo.c
index 6bfbff1..d679600 100644
--- a/source/encoder/rdo.c
+++ b/source/encoder/rdo.c
@@ -38,7 +38,7 @@
 #include "rdo.h"
 #include "cudata.h"
 #include "aec.h"
-#include "common/mc.h"
+#include "mc.h"
 #include "transform.h"
 #include "block_info.h"
 #include "wquant.h"
@@ -58,8 +58,8 @@
 /* ---------------------------------------------------------------------------
  */
 static const float SUBCU_COST_RATE[2][4] = {
-    {0.50f, 0.75f, 0.97f, 1.0f},   /* Ö¡ÄÚCUµÄCostÒ»°ã¶¼½Ï´ó */
-    {0.75f, 0.90f, 0.99f, 1.0f},   /* Ö¡¼äÇé¿öÏÂ£¬Skip¿éCostºÜÐ¡ */
+    {0.50f, 0.75f, 0.97f, 1.0f},   /* the cost of intra CUs is generally large */
+    {0.75f, 0.90f, 0.99f, 1.0f},   /* in the inter case, skip blocks have a very small cost */
 };
 
 static const int tab_pdir_bskip[DS_MAX_NUM] = {
@@ -124,7 +124,7 @@ static const int8_t headerbits_skipmode[8] = { 2, 3, 4, 4, 3, 4, 5, 5 };//tempor
  */
 
 /* ---------------------------------------------------------------------------
- * ÒÀ¾ÝCU»®·ÖÄ£Ê½È·¶¨µ±Ç°CU°üº¬µÄPUÊýÁ¿ºÍ´óÐ¡£¨Ö¡¼ä»®·Ö£©
+ * Determine the number and size of the PUs in the current CU from its partition mode (inter partitioning)
  */
 static ALWAYS_INLINE
 void cu_init_pu_inter(xavs2_t *h, cu_info_t *p_cu_info, int i_level, int i_mode)
@@ -137,7 +137,7 @@ void cu_init_pu_inter(xavs2_t *h, cu_info_t *p_cu_info, int i_level, int i_mode)
 
     // set for each block
     if (i_mode == PRED_SKIP) {
-        ///! Ò»Ð©ÌØÊâµÄSkip/DirectÄ£Ê½ÏÂÈç¹ûCU³¬¹ý8x8£¬ÔòPU»®·Ö³É4¸ö
+        ///! In some special Skip/Direct modes, a CU larger than 8x8 is split into 4 PUs
         if (i_level > 3 && (h->i_type == SLICE_TYPE_P || (h->i_type == SLICE_TYPE_F && ds_mode == DS_NONE)
                             || (h->i_type == SLICE_TYPE_B && ds_mode == DS_NONE))) {
             p_cu_info->num_pu = 4;
@@ -158,7 +158,7 @@ void cu_init_pu_inter(xavs2_t *h, cu_info_t *p_cu_info, int i_level, int i_mode)
 }
 
 /* ---------------------------------------------------------------------------
- * ÒÀ¾ÝCU»®·ÖÄ£Ê½È·¶¨µ±Ç°CU°üº¬µÄPUÊýÁ¿ºÍ´óÐ¡£¨Ö¡ÄÚ»®·Ö£©
+ * Determine the number and size of the PUs in the current CU from its partition mode (intra partitioning)
  */
 static ALWAYS_INLINE
 void cu_init_pu_intra(xavs2_t *h, cu_info_t *p_cu_info, int i_level, int i_mode)
@@ -208,31 +208,59 @@ void cu_init(xavs2_t *h, cu_t *p_cu, cu_info_t *best, int i_level)
     cu_layer_t *p_layer  = cu_get_layer(h, i_level);
     int i;
 
+    if (h->param->input_sample_bit_depth == 8) {
     /* Ping-pong buffer */
-    p_layer->buf_pred_inter      = p_layer->buf_pred_inter_luma[0];
-    p_layer->buf_pred_inter_best = p_layer->buf_pred_inter_luma[1];
+    p_layer->buf_pred_inter8      = p_layer->buf_pred_inter_luma8[0];
+    p_layer->buf_pred_inter8_best = p_layer->buf_pred_inter_luma8[1];
 
     /* init rec and coeff pointer */
-    p_cu->cu_info.p_rec  [0]      = p_layer->rec_buf_y [0];
+    p_cu->cu_info.p_rec8  [0]      = p_layer->rec8_buf_y [0];
     p_cu->cu_info.p_coeff[0]      = p_layer->coef_buf_y[0];
-    p_layer->p_rec_tmp   [0]      = p_layer->rec_buf_y [1];
+    p_layer->p_rec8_tmp   [0]      = p_layer->rec8_buf_y [1];
     p_layer->p_coeff_tmp [0]      = p_layer->coef_buf_y[1];
-    best->p_rec          [0]      = p_layer->rec_buf_y [2];
+    best->p_rec8          [0]      = p_layer->rec8_buf_y [2];
     best->p_coeff        [0]      = p_layer->coef_buf_y[2];
 
-    p_cu->cu_info.p_rec  [1]      = p_layer->rec_buf_uv [0][0];
+    p_cu->cu_info.p_rec8  [1]      = p_layer->rec8_buf_uv [0][0];
     p_cu->cu_info.p_coeff[1]      = p_layer->coef_buf_uv[0][0];
-    p_layer->p_rec_tmp   [1]      = p_layer->rec_buf_uv [0][1];
+    p_layer->p_rec8_tmp   [1]      = p_layer->rec8_buf_uv [0][1];
     p_layer->p_coeff_tmp [1]      = p_layer->coef_buf_uv[0][1];
-    best->p_rec          [1]      = p_layer->rec_buf_uv [0][2];
+    best->p_rec8          [1]      = p_layer->rec8_buf_uv [0][2];
     best->p_coeff        [1]      = p_layer->coef_buf_uv[0][2];
 
-    p_cu->cu_info.p_rec  [2]      = p_layer->rec_buf_uv [1][0];
+    p_cu->cu_info.p_rec8  [2]      = p_layer->rec8_buf_uv [1][0];
     p_cu->cu_info.p_coeff[2]      = p_layer->coef_buf_uv[1][0];
-    p_layer->p_rec_tmp   [2]      = p_layer->rec_buf_uv [1][1];
+    p_layer->p_rec8_tmp   [2]      = p_layer->rec8_buf_uv [1][1];
     p_layer->p_coeff_tmp [2]      = p_layer->coef_buf_uv[1][1];
-    best->p_rec          [2]      = p_layer->rec_buf_uv [1][2];
+    best->p_rec8          [2]      = p_layer->rec8_buf_uv [1][2];
     best->p_coeff        [2]      = p_layer->coef_buf_uv[1][2];
+    } else {
+    /* Ping-pong buffer */
+    p_layer->buf_pred_inter10      = p_layer->buf_pred_inter_luma10[0];
+    p_layer->buf_pred_inter10_best = p_layer->buf_pred_inter_luma10[1];
+
+    /* init rec and coeff pointer */
+    p_cu->cu_info.p_rec10  [0]      = p_layer->rec10_buf_y [0];
+    p_cu->cu_info.p_coeff[0]      = p_layer->coef_buf_y[0];
+    p_layer->p_rec10_tmp   [0]      = p_layer->rec10_buf_y [1];
+    p_layer->p_coeff_tmp [0]      = p_layer->coef_buf_y[1];
+    best->p_rec10          [0]      = p_layer->rec10_buf_y [2];
+    best->p_coeff        [0]      = p_layer->coef_buf_y[2];
+
+    p_cu->cu_info.p_rec10  [1]      = p_layer->rec10_buf_uv [0][0];
+    p_cu->cu_info.p_coeff[1]      = p_layer->coef_buf_uv[0][0];
+    p_layer->p_rec10_tmp   [1]      = p_layer->rec10_buf_uv [0][1];
+    p_layer->p_coeff_tmp [1]      = p_layer->coef_buf_uv[0][1];
+    best->p_rec10          [1]      = p_layer->rec10_buf_uv [0][2];
+    best->p_coeff        [1]      = p_layer->coef_buf_uv[0][2];
+
+    p_cu->cu_info.p_rec10  [2]      = p_layer->rec10_buf_uv [1][0];
+    p_cu->cu_info.p_coeff[2]      = p_layer->coef_buf_uv[1][0];
+    p_layer->p_rec10_tmp   [2]      = p_layer->rec10_buf_uv [1][1];
+    p_layer->p_coeff_tmp [2]      = p_layer->coef_buf_uv[1][1];
+    best->p_rec10          [2]      = p_layer->rec10_buf_uv [1][2];
+    best->p_coeff        [2]      = p_layer->coef_buf_uv[1][2];
+    }
 
     /* init basic properties */
     p_cu->cu_info.i_cbp = 0;
@@ -255,7 +283,7 @@ void cu_init(xavs2_t *h, cu_t *p_cu, cu_info_t *best, int i_level)
     }
 #endif
 
-    /* ref_idx_1st[], ref_idx_2nd[] ÄÚ´æÁ¬Ðø */
+    /* ref_idx_1st[] and ref_idx_2nd[] are contiguous in memory */
     memset(p_cu->cu_info.ref_idx_1st, INVALID_REF, sizeof(p_cu->cu_info.ref_idx_1st) + sizeof(p_cu->cu_info.ref_idx_2nd));
 
     /* init position for 4 sub-CUs */
@@ -299,9 +327,15 @@ void cu_store_parameters(xavs2_t *h, cu_t *p_cu, cu_info_t *best)
     cu_copy_info(best, &p_cu->cu_info);
 
     /* --- reconstructed blocks ---- */
-    XAVS2_SWAP_PTR(best->p_rec[0], p_cu->cu_info.p_rec[0]);
-    XAVS2_SWAP_PTR(best->p_rec[1], p_cu->cu_info.p_rec[1]);
-    XAVS2_SWAP_PTR(best->p_rec[2], p_cu->cu_info.p_rec[2]);
+    if (h->param->input_sample_bit_depth == 8) {
+    XAVS2_SWAP_PTR(best->p_rec8[0], p_cu->cu_info.p_rec8[0]);
+    XAVS2_SWAP_PTR(best->p_rec8[1], p_cu->cu_info.p_rec8[1]);
+    XAVS2_SWAP_PTR(best->p_rec8[2], p_cu->cu_info.p_rec8[2]);
+    } else {
+    XAVS2_SWAP_PTR(best->p_rec10[0], p_cu->cu_info.p_rec10[0]);
+    XAVS2_SWAP_PTR(best->p_rec10[1], p_cu->cu_info.p_rec10[1]);
+    XAVS2_SWAP_PTR(best->p_rec10[2], p_cu->cu_info.p_rec10[2]);
+    }
 
     /* ---- residual (coefficients) ---- */
     XAVS2_SWAP_PTR(best->p_coeff[0], p_cu->cu_info.p_coeff[0]);
@@ -391,19 +425,35 @@ void cu_copy_stored_parameters(xavs2_t *h, cu_t *p_cu, cu_info_t *best)
     cu_copy_info(&p_cu->cu_info, best);
 
     //===== reconstruction values =====
-    g_funcs.pixf.copy_pp[PART_INDEX(blocksize, blocksize)](h->lcu.p_fdec[0] + pix_y * FDEC_STRIDE + pix_x, FDEC_STRIDE,
-            best->p_rec[0], FREC_STRIDE);
-    g_funcs.pixf.copy_ss[PART_INDEX(blocksize, blocksize)](h->lcu.lcu_coeff[0] + (p_cu->idx_zorder << 6), blocksize,
+    if (h->param->input_sample_bit_depth == 8) {
+    g_funcs.pixf.copy_pp8[PART_INDEX(blocksize, blocksize)](h->lcu.p_fdec8[0] + pix_y * FDEC_STRIDE + pix_x, FDEC_STRIDE,
+            best->p_rec8[0], FREC_STRIDE);
+    g_funcs.pixf.copy_ss8[PART_INDEX(blocksize, blocksize)](h->lcu.lcu_coeff[0] + (p_cu->idx_zorder << 6), blocksize,
+            best->p_coeff[0], blocksize);
+
+    g_funcs.pixf.copy_pp8[part_idx_c](h->lcu.p_fdec8[1] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE,
+                                     best->p_rec8[1], FREC_CSTRIDE / 2);
+    g_funcs.pixf.copy_pp8[part_idx_c](h->lcu.p_fdec8[2] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE,
+                                     best->p_rec8[2], FREC_CSTRIDE / 2);
+    g_funcs.pixf.copy_ss8[part_idx_c](h->lcu.lcu_coeff[1] + (p_cu->idx_zorder << 4), blocksize >> 1,
+                                     best->p_coeff[1], blocksize >> 1);
+    g_funcs.pixf.copy_ss8[part_idx_c](h->lcu.lcu_coeff[2] + (p_cu->idx_zorder << 4), blocksize >> 1,
+                                     best->p_coeff[2], blocksize >> 1);
+    } else {
+    g_funcs.pixf.copy_pp10[PART_INDEX(blocksize, blocksize)](h->lcu.p_fdec10[0] + pix_y * FDEC_STRIDE + pix_x, FDEC_STRIDE,
+            best->p_rec10[0], FREC_STRIDE);
+    g_funcs.pixf.copy_ss10[PART_INDEX(blocksize, blocksize)](h->lcu.lcu_coeff[0] + (p_cu->idx_zorder << 6), blocksize,
             best->p_coeff[0], blocksize);
 
-    g_funcs.pixf.copy_pp[part_idx_c](h->lcu.p_fdec[1] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE,
-                                     best->p_rec[1], FREC_CSTRIDE / 2);
-    g_funcs.pixf.copy_pp[part_idx_c](h->lcu.p_fdec[2] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE,
-                                     best->p_rec[2], FREC_CSTRIDE / 2);
-    g_funcs.pixf.copy_ss[part_idx_c](h->lcu.lcu_coeff[1] + (p_cu->idx_zorder << 4), blocksize >> 1,
+    g_funcs.pixf.copy_pp10[part_idx_c](h->lcu.p_fdec10[1] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE,
+                                     best->p_rec10[1], FREC_CSTRIDE / 2);
+    g_funcs.pixf.copy_pp10[part_idx_c](h->lcu.p_fdec10[2] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE,
+                                     best->p_rec10[2], FREC_CSTRIDE / 2);
+    g_funcs.pixf.copy_ss10[part_idx_c](h->lcu.lcu_coeff[1] + (p_cu->idx_zorder << 4), blocksize >> 1,
                                      best->p_coeff[1], blocksize >> 1);
-    g_funcs.pixf.copy_ss[part_idx_c](h->lcu.lcu_coeff[2] + (p_cu->idx_zorder << 4), blocksize >> 1,
+    g_funcs.pixf.copy_ss10[part_idx_c](h->lcu.lcu_coeff[2] + (p_cu->idx_zorder << 4), blocksize >> 1,
                                      best->p_coeff[2], blocksize >> 1);
+    }
 
     //===============   cbp and mode   ===============
     for (j = 0; j < size_in_scu; j++) {
@@ -560,11 +610,11 @@ void cu_get_neighbors(xavs2_t *h, cu_t *p_cu, cb_t *p_cb)
     int b_available_TR  = h->tab_avail_TR[(y_TR_4x4_in_lcu << (h->i_lcu_level - B4X4_IN_BIT)) + x_TR_4x4_in_lcu];
 
     /* 2. get neighboring blocks */
-    /* ×óÉÏ */
+    /* top-left */
     cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOPLEFT ], xx0 - 1, yy0 - 1);
 
-    /* ×óÁÚµÄPUÐÅÏ¢ */
-    if (IS_VER_PU_PART(p_cu->cu_info.i_mode) && p_cb->x != 0) {  // CU´¹Ö±»®·ÖÎªÁ½¸öPU£¬ÇÒµ±Ç°PUÎªÓÒ±ßÒ»¸ö
+    /* PU information of the left neighbor */
+    if (IS_VER_PU_PART(p_cu->cu_info.i_mode) && p_cb->x != 0) {  // the CU is split vertically into two PUs and the current PU is the right one
         neighbor_inter_t *p_neighbor = neighbors + BLK_LEFT;
         p_neighbor->is_available = 1;
         // cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT], xx0 - 1, yy0);
@@ -579,8 +629,8 @@ void cu_get_neighbors(xavs2_t *h, cu_t *p_cu, cb_t *p_cb)
         cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT2], xx0 - 1, yy1);
     }
 
-    /* ÉÏÁÚµÄPUÐÅÏ¢ */
-    if (IS_HOR_PU_PART(p_cu->cu_info.i_mode) && p_cb->y != 0) {  // CUË®Æ½»®·ÖÎªÁ½¸öPU£¬ÇÒµ±Ç°PUÎªÏÂ±ßÒ»¸ö
+    /* PU information of the top neighbor */
+    if (IS_HOR_PU_PART(p_cu->cu_info.i_mode) && p_cb->y != 0) {  // the CU is split horizontally into two PUs and the current PU is the bottom one
         neighbor_inter_t *p_neighbor = neighbors + BLK_TOP;
         p_neighbor->is_available = 1;
         // cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT], xx0 - 1, yy0);
@@ -595,7 +645,7 @@ void cu_get_neighbors(xavs2_t *h, cu_t *p_cu, cb_t *p_cb)
         cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOP2], xx1, yy0 - 1);
     }
 
-    /* ÓÒÉÏ */
+    /* top-right */
     cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOPRIGHT], b_available_TR ? xx1 + 1 : -1, yy0 - 1);
 
     cu_get_neighbor_temporal(h, &neighbors[BLK_COL], xx0, yy0);
@@ -612,9 +662,9 @@ int cu_get_mvs_for_mc(xavs2_t *h, cu_t *p_cu, int pu_idx,
 {
     int num_ref;            // number of reference frames
     int dmh_mode = p_cu->cu_info.dmh_mode;
-    int ref_1st = p_cu->cu_info.ref_idx_1st[pu_idx]; // µÚÒ»£¨Ç°Ïò»òÕßBÖ¡µ¥ÏòÔ¤²â£©ÔË¶¯Ê¸Á¿
-    int ref_2nd = p_cu->cu_info.ref_idx_2nd[pu_idx]; // µÚ¶þ£¨BÖ¡Ë«ÏòµÄºóÏò£©
-    mv_t mv_1st, mv_2nd;    // µÚÒ»£¨Ç°Ïò»òÕßBÖ¡µ¥ÏòÔ¤²â£©ºÍµÚ¶þ£¨ºóÏò£©ÔË¶¯Ê¸Á¿
+    int ref_1st = p_cu->cu_info.ref_idx_1st[pu_idx]; // reference index of the first motion vector (forward, or uni-directional prediction in a B frame)
+    int ref_2nd = p_cu->cu_info.ref_idx_2nd[pu_idx]; // reference index of the second one (backward direction of bi-directional prediction in a B frame)
+    mv_t mv_1st, mv_2nd;    // first (forward, or uni-directional prediction in a B frame) and second (backward) motion vectors
 
     if (h->i_type != SLICE_TYPE_B) {
         num_ref = (ref_1st != INVALID_REF) + (ref_2nd != INVALID_REF);
@@ -764,9 +814,9 @@ static INLINE
 void tu_get_dct_coeff(xavs2_t *h, coeff_t *cur_blk, int pu_size_idx, int bsx, int bsy)
 {
     if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && !h->lcu.b_2nd_rdcost_pass && bsx >= 32 && bsy >= 32) {
-        g_funcs.dctf.dct_half[pu_size_idx](cur_blk, cur_blk, bsx);
+        g_funcs.dctf.dct_half[pu_size_idx](h, cur_blk, cur_blk, bsx);  /* dct_half table entry; cur_blk is passed as both block arguments (in-place); encoder handle h is now forwarded */
     } else {
-        g_funcs.dctf.dct[pu_size_idx](cur_blk, cur_blk, bsx);
+        g_funcs.dctf.dct[pu_size_idx](h, cur_blk, cur_blk, bsx);  /* regular dct table entry, same in-place calling pattern */
     }
 }
 
@@ -796,20 +846,67 @@ static int cu_recon_chroma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, dist_t *distort
     int uv;
     cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level);
     coeff_t *cur_blk = p_enc->coeff_blk;
-    pel_t *p_pred;
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_pred;
+
+    /* prediction buffer of chroma blocks */
+    if (b_intra) {
+        p_pred = p_enc->intra8_pred_c[p_cu->cu_info.i_intra_mode_c];
+    } else {
+        p_pred = p_enc->buf_pred_inter8_c;
+    }
+
+    for (uv = 0; uv < 2; uv++) {
+        pel8_t *p_fdec = p_cu->cu_info.p_rec8[uv + 1];
+        pel8_t *p_fenc = h->lcu.p_fenc8[uv + 1] + pix_y_c * FENC_STRIDE + pix_x_c;
+
+        g_funcs.pixf.sub_ps8[partidx_c](cur_blk, bsize_c, p_fenc, p_pred, FENC_STRIDE, FREC_CSTRIDE);
+
+        // DCT, quantization, inverse quantization, IDCT, and reconstruction
+        tu_get_dct_coeff(h, cur_blk, partidx_c, bsize_c, bsize_c);
+
+        qp_c = cu_get_qp(h, &p_cu->cu_info);
+#if ENABLE_WQUANT
+        qp_c += (uv == 0 ? h->param->chroma_quant_param_delta_u : h->param->chroma_quant_param_delta_v);
+#endif
+
+        qp_c = cu_get_chroma_qp(h, qp_c, uv);
+
+        num_nonzero = tu_quant_forward(h, p_aec, p_cu, cur_blk, level_c, bsize_c, bsize_c, qp_c, b_intra, 0, DC_PRED);
+        cbp_c |= (num_nonzero != 0) << (4 + uv);
+
+        if (num_nonzero) {
+            g_funcs.pixf.copy_ss8[partidx_c](p_cu->cu_info.p_coeff[uv + 1], bsize_c, cur_blk, bsize_c);
+
+            tu_quant_inverse(h, p_cu, cur_blk, bsize_c * bsize_c, level_c, qp_c, 0);
+            g_funcs.dctf.idct[partidx_c](h, cur_blk, cur_blk, bsize_c);
+
+            g_funcs.pixf.add_ps8[partidx_c](h, p_fdec, FREC_CSTRIDE / 2, p_pred, cur_blk, FREC_CSTRIDE, bsize_c);
+        } else {
+            g_funcs.pixf.copy_pp8[partidx_c](p_fdec, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE);
+        }
+
+        *distortion += g_funcs.pixf.ssd8[partidx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2);
+
+        p_pred += (FREC_CSTRIDE >> 1);  // uvoffset
+    }
+
+    return cbp_c;
+    } else {
+    pel10_t *p_pred;
 
     /* prediction buffer of chroma blocks */
     if (b_intra) {
-        p_pred = p_enc->intra_pred_c[p_cu->cu_info.i_intra_mode_c];
+        p_pred = p_enc->intra10_pred_c[p_cu->cu_info.i_intra_mode_c];
     } else {
-        p_pred = p_enc->buf_pred_inter_c;
+        p_pred = p_enc->buf_pred_inter10_c;
     }
 
     for (uv = 0; uv < 2; uv++) {
-        pel_t *p_fdec = p_cu->cu_info.p_rec[uv + 1];
-        pel_t *p_fenc = h->lcu.p_fenc[uv + 1] + pix_y_c * FENC_STRIDE + pix_x_c;
+        pel10_t *p_fdec = p_cu->cu_info.p_rec10[uv + 1];
+        pel10_t *p_fenc = h->lcu.p_fenc10[uv + 1] + pix_y_c * FENC_STRIDE + pix_x_c;
 
-        g_funcs.pixf.sub_ps[partidx_c](cur_blk, bsize_c, p_fenc, p_pred, FENC_STRIDE, FREC_CSTRIDE);
+        g_funcs.pixf.sub_ps10[partidx_c](cur_blk, bsize_c, p_fenc, p_pred, FENC_STRIDE, FREC_CSTRIDE);
 
         // DCT, quantization, inverse quantization, IDCT, and reconstruction
         tu_get_dct_coeff(h, cur_blk, partidx_c, bsize_c, bsize_c);
@@ -825,22 +922,23 @@ static int cu_recon_chroma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, dist_t *distort
         cbp_c |= (num_nonzero != 0) << (4 + uv);
 
         if (num_nonzero) {
-            g_funcs.pixf.copy_ss[partidx_c](p_cu->cu_info.p_coeff[uv + 1], bsize_c, cur_blk, bsize_c);
+            g_funcs.pixf.copy_ss10[partidx_c](p_cu->cu_info.p_coeff[uv + 1], bsize_c, cur_blk, bsize_c);
 
             tu_quant_inverse(h, p_cu, cur_blk, bsize_c * bsize_c, level_c, qp_c, 0);
-            g_funcs.dctf.idct[partidx_c](cur_blk, cur_blk, bsize_c);
+            g_funcs.dctf.idct[partidx_c](h, cur_blk, cur_blk, bsize_c);
 
-            g_funcs.pixf.add_ps[partidx_c](p_fdec, FREC_CSTRIDE / 2, p_pred, cur_blk, FREC_CSTRIDE, bsize_c);
+            g_funcs.pixf.add_ps10[partidx_c](h, p_fdec, FREC_CSTRIDE / 2, p_pred, cur_blk, FREC_CSTRIDE, bsize_c);
         } else {
-            g_funcs.pixf.copy_pp[partidx_c](p_fdec, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE);
+            g_funcs.pixf.copy_pp10[partidx_c](p_fdec, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE);
         }
 
-        *distortion += g_funcs.pixf.ssd[partidx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2);
+        *distortion += g_funcs.pixf.ssd10[partidx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2);
 
         p_pred += (FREC_CSTRIDE >> 1);  // uvoffset
     }
 
     return cbp_c;
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -853,9 +951,9 @@ int rdo_get_left_bits(xavs2_t *h, rdcost_t min_rdcost, dist_t distortion)
     double f_left_bits = ((min_rdcost - distortion) * h->f_lambda_1th) + 1;
     int left_bits;
 
-    left_bits = (int)XAVS2_CLIP3F(0.0f, 32766.0f, f_left_bits);    // clipµ½Ò»¸öºÏÀíµÄÇø¼äÄÚ
+    left_bits = (int)XAVS2_CLIP3F(0.0f, 32766.0f, f_left_bits);    // clip into a reasonable range
     if (left_bits * f_lambda + distortion <= min_rdcost) {
-        left_bits++;    // ±ÜÃâ¸¡µãÊýÔËËãÎó²î£¬±£Ö¤±ÈÌØÊý´ïµ½¸ÃÖµÊ±rdcost´óÓÚmin_rdcost
+        left_bits++;    // guard against floating-point rounding error: ensure rdcost exceeds min_rdcost once the bit count reaches this value
     }
 
     return left_bits;
@@ -873,7 +971,81 @@ int rdo_get_left_bits(xavs2_t *h, rdcost_t min_rdcost, dist_t distortion)
  * and reconstruction pixel generation of a intra luma block
  */
 static INLINE
-int cu_recon_intra_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel_t *p_pred, int bsx, int bsy,
+int cu_recon_intra_luma8(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel8_t *p_pred, int bsx, int bsy,
+                        int block_x, int block_y, int idx_tu, int intra_pred_mode, dist_t *distortion)
+{
+    int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS);  // 1 when the CU level is B64X64_IN_BIT and the TU split is not TU_SPLIT_CROSS; halves w_tr/h_tr below
+    int i_tu_level = p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON);  // CU level, minus one when the TU split type is not TU_SPLIT_NON
+    int pos_x = p_cu->i_pos_x + block_x;  // block position used to index h->lcu.p_fenc8
+    int pos_y = p_cu->i_pos_y + block_y;
+    int part_idx = PART_INDEX(bsx, bsy);
+    int w_tr = bsx >> used_wavelet;  // transform width (bsx, or bsx / 2 when used_wavelet is set)
+    int h_tr = bsy >> used_wavelet;  // transform height (bsy, or bsy / 2 when used_wavelet is set)
+    int num_non_zero;
+    int b_2nd_trans = h->param->enable_secT;  // secondary transform switch taken from the encoder parameters
+    cu_parallel_t *p_enc  = cu_get_enc_context(h, p_cu->cu_info.i_level);
+    pel8_t      *p_fenc    = h->lcu.p_fenc8[0] + pos_y * FENC_STRIDE + pos_x;
+    pel8_t      *p_fdec    = p_cu->cu_info.p_rec8[0] + block_y * FREC_STRIDE + block_x;
+    coeff_t    *cur_blk   = p_enc->coeff_blk;  // working coefficient block, reused in place by every stage below
+    coeff_t    *p_coeff_y = p_cu->cu_info.p_coeff[0] + (idx_tu << ((p_cu->cu_info.i_level - 1) << 1));  // per-TU offset into the CU luma coefficient buffer
+    int b_top  = IS_NEIGHBOR_AVAIL(p_cu->block_avail, MD_I_TOP);
+    int b_left = IS_NEIGHBOR_AVAIL(p_cu->block_avail, MD_I_LEFT);
+
+    // prediction error between p_fenc and p_pred, written to cur_blk (8-bit pixel path)
+    g_funcs.pixf.sub_ps8[PART_INDEX(bsx, bsy)](cur_blk, bsx, p_fenc, p_pred, FENC_STRIDE, bsx);
+
+    // block transform (4x4 blocks use dedicated 4x4 routines)
+    if (part_idx == LUMA_4x4) {
+        if (b_2nd_trans) {
+            g_funcs.dctf.transform_4x4_2nd(h, cur_blk, w_tr);
+        } else {
+            g_funcs.dctf.dct[LUMA_4x4](h, cur_blk, cur_blk, 4);     /* 4x4 dct */
+        }
+    } else {
+        tu_get_dct_coeff(h, cur_blk, part_idx, w_tr, h_tr);
+
+        if (b_2nd_trans) {
+            g_funcs.dctf.transform_2nd(cur_blk, w_tr, intra_pred_mode, b_top, b_left);
+        }
+    }
+
+    // quantization
+    num_non_zero = tu_quant_forward(h, p_aec, p_cu, cur_blk, i_tu_level, w_tr, h_tr, cu_get_qp(h, &p_cu->cu_info), 1, 1, intra_pred_mode);
+
+    if (num_non_zero) {
+        g_funcs.pixf.copy_ss8[PART_INDEX(w_tr, h_tr)](p_coeff_y, w_tr, cur_blk, w_tr);  // save the quantized block into p_coeff_y before cur_blk is reused by the inverse path
+
+        // inverse quantization
+        tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_tu_level, cu_get_qp(h, &p_cu->cu_info), 1);
+
+        // inverse transform (mirrors the forward path in reverse order)
+        if (part_idx == LUMA_4x4) {
+            if (b_2nd_trans) {
+                g_funcs.dctf.inv_transform_4x4_2nd(h, cur_blk, w_tr);
+            } else {
+                g_funcs.dctf.idct[LUMA_4x4](h, cur_blk, cur_blk, 4);    /* 4x4 idct */
+            }
+        } else {
+            if (b_2nd_trans) {
+                g_funcs.dctf.inv_transform_2nd(cur_blk, w_tr, intra_pred_mode, b_top, b_left);
+            }
+
+            g_funcs.dctf.idct[part_idx](h, cur_blk, cur_blk, w_tr);
+        }
+
+        g_funcs.pixf.add_ps8[part_idx](h, p_fdec, FREC_STRIDE, p_pred, cur_blk, bsx, bsx);  // combine p_pred and the residual in cur_blk into p_fdec
+    } else {
+        g_funcs.pixf.copy_pp8[part_idx](p_fdec, FREC_STRIDE, p_pred, bsx);  // nothing coded: the reconstruction is the prediction itself
+    }
+
+    // get distortion (SSD) of current block, between p_fenc and the reconstruction in p_fdec
+    *distortion = g_funcs.pixf.ssd8[part_idx](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE);
+
+    return num_non_zero;  // value returned by tu_quant_forward for this block
+}
+
+static INLINE
+int cu_recon_intra_luma10(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel10_t *p_pred, int bsx, int bsy,
                         int block_x, int block_y, int idx_tu, int intra_pred_mode, dist_t *distortion)
 {
     int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS);
@@ -886,22 +1058,22 @@ int cu_recon_intra_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel_t *p_pred, int
     int num_non_zero;
     int b_2nd_trans = h->param->enable_secT;
     cu_parallel_t *p_enc  = cu_get_enc_context(h, p_cu->cu_info.i_level);
-    pel_t      *p_fenc    = h->lcu.p_fenc[0] + pos_y * FENC_STRIDE + pos_x;
-    pel_t      *p_fdec    = p_cu->cu_info.p_rec[0] + block_y * FREC_STRIDE + block_x;
+    pel10_t      *p_fenc    = h->lcu.p_fenc10[0] + pos_y * FENC_STRIDE + pos_x;
+    pel10_t      *p_fdec    = p_cu->cu_info.p_rec10[0] + block_y * FREC_STRIDE + block_x;
     coeff_t    *cur_blk   = p_enc->coeff_blk;
     coeff_t    *p_coeff_y = p_cu->cu_info.p_coeff[0] + (idx_tu << ((p_cu->cu_info.i_level - 1) << 1));
     int b_top  = IS_NEIGHBOR_AVAIL(p_cu->block_avail, MD_I_TOP);
     int b_left = IS_NEIGHBOR_AVAIL(p_cu->block_avail, MD_I_LEFT);
 
     // get prediction and prediction error
-    g_funcs.pixf.sub_ps[PART_INDEX(bsx, bsy)](cur_blk, bsx, p_fenc, p_pred, FENC_STRIDE, bsx);
+    g_funcs.pixf.sub_ps10[PART_INDEX(bsx, bsy)](cur_blk, bsx, p_fenc, p_pred, FENC_STRIDE, bsx);
 
     // block transform
     if (part_idx == LUMA_4x4) {
         if (b_2nd_trans) {
-            g_funcs.dctf.transform_4x4_2nd(cur_blk, w_tr);
+            g_funcs.dctf.transform_4x4_2nd(h, cur_blk, w_tr);
         } else {
-            g_funcs.dctf.dct[LUMA_4x4](cur_blk, cur_blk, 4);     /* 4x4 dct */
+            g_funcs.dctf.dct[LUMA_4x4](h, cur_blk, cur_blk, 4);     /* 4x4 dct */
         }
     } else {
         tu_get_dct_coeff(h, cur_blk, part_idx, w_tr, h_tr);
@@ -915,7 +1087,7 @@ int cu_recon_intra_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel_t *p_pred, int
     num_non_zero = tu_quant_forward(h, p_aec, p_cu, cur_blk, i_tu_level, w_tr, h_tr, cu_get_qp(h, &p_cu->cu_info), 1, 1, intra_pred_mode);
 
     if (num_non_zero) {
-        g_funcs.pixf.copy_ss[PART_INDEX(w_tr, h_tr)](p_coeff_y, w_tr, cur_blk, w_tr);
+        g_funcs.pixf.copy_ss10[PART_INDEX(w_tr, h_tr)](p_coeff_y, w_tr, cur_blk, w_tr);
 
         // inverse quantization
         tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_tu_level, cu_get_qp(h, &p_cu->cu_info), 1);
@@ -923,25 +1095,25 @@ int cu_recon_intra_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel_t *p_pred, int
         // inverse transform
         if (part_idx == LUMA_4x4) {
             if (b_2nd_trans) {
-                g_funcs.dctf.inv_transform_4x4_2nd(cur_blk, w_tr);
+                g_funcs.dctf.inv_transform_4x4_2nd(h, cur_blk, w_tr);
             } else {
-                g_funcs.dctf.idct[LUMA_4x4](cur_blk, cur_blk, 4);    /* 4x4 idct */
+                g_funcs.dctf.idct[LUMA_4x4](h, cur_blk, cur_blk, 4);    /* 4x4 idct */
             }
         } else {
             if (b_2nd_trans) {
                 g_funcs.dctf.inv_transform_2nd(cur_blk, w_tr, intra_pred_mode, b_top, b_left);
             }
 
-            g_funcs.dctf.idct[part_idx](cur_blk, cur_blk, w_tr);
+            g_funcs.dctf.idct[part_idx](h, cur_blk, cur_blk, w_tr);
         }
 
-        g_funcs.pixf.add_ps[part_idx](p_fdec, FREC_STRIDE, p_pred, cur_blk, bsx, bsx);
+        g_funcs.pixf.add_ps10[part_idx](h, p_fdec, FREC_STRIDE, p_pred, cur_blk, bsx, bsx);
     } else {
-        g_funcs.pixf.copy_pp[part_idx](p_fdec, FREC_STRIDE, p_pred, bsx);
+        g_funcs.pixf.copy_pp10[part_idx](p_fdec, FREC_STRIDE, p_pred, bsx);
     }
 
     // get distortion (SSD) of current block
-    *distortion = g_funcs.pixf.ssd[part_idx](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE);
+    *distortion = g_funcs.pixf.ssd10[part_idx](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE);
 
     return num_non_zero;
 }
@@ -992,7 +1164,7 @@ void xavs2_get_mpms(xavs2_t *h, cu_t *p_cu, int blockidx, int pos_y, int pos_x,
 
 
 /* ---------------------------------------------------------------------------
- * ¼ì²éÖ¡ÄÚPU»®·Ö·½Ê½µÄRDCost²¢¸üÐÂ×îÓÅµÄPU»®·Ö·½Ê½
+ * check the RDCost of an intra PU partition mode and update the best PU partition mode
  */
 static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best, int mode, rdcost_t *min_rdcost)
 {
@@ -1002,8 +1174,6 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
     rdcost_t rdcost_luma = 0;
     rdcost_t rdcost = MAX_COST;
     rdcost_t min_mode_rdcost = MAX_COST;
-    pel_t *rec_bak_y = best->p_rec[0];
-    pel_t *p_best_part[4];
     int blockidx;
     int num_luma_block = mode != PRED_I_2Nx2N ? 4 : 1;
     int b_need_swap_buf = 0;
@@ -1011,10 +1181,10 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
     int pix_y_c = p_cu->i_pos_y >> CHROMA_V_SHIFT;
     intra_candidate_t *p_candidates = p_layer->intra_candidates;
 
-    /* È·¶¨PU»®·ÖÀàÐÍ */
+    /* determine the PU partition type */
     cu_init_pu_intra(h, &p_cu->cu_info, level, mode);
 
-    /* È·¶¨TU»®·ÖÀàÐÍ */
+    /* determine the TU split type */
     cu_set_tu_split_type(h, &p_cu->cu_info, mode != PRED_I_2Nx2N);
 
     h->copy_aec_state_rdo(&p_layer->cs_rdo, p_aec);
@@ -1022,6 +1192,9 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
 
     p_cu->intra_avail = (uint8_t)xavs2_intra_get_cu_neighbors(h, p_cu, p_cu->i_pix_x, p_cu->i_pix_y, p_cu->i_size);
 
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *rec_bak_y = best->p_rec8[0];
+    pel8_t *p_best_part[4];
     /* 1, intra luma prediction and mode decision */
     for (blockidx = 0; blockidx < num_luma_block; blockidx++) {
         int mpm[2];  // most probable modes (MPMs) for current luma block
@@ -1037,7 +1210,7 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         int best_mode = 0;
         int best_pmode = 0;
         int best_cbp = 0;
-        pel_t *p_fenc = h->lcu.p_fenc[0] + pos_y * FENC_STRIDE + pos_x;
+        pel8_t *p_fenc = h->lcu.p_fenc8[0] + pos_y * FENC_STRIDE + pos_x;
         rdcost_t best_rdcost = MAX_COST;
         int i;
         int num_for_rdo;
@@ -1052,7 +1225,7 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         }
 
         /* conduct prediction and get intra prediction direction candidates for RDO */
-        num_for_rdo = h->lcu.get_intra_dir_for_rdo_luma(h, p_cu, p_candidates, p_fenc, mpm, blockidx,
+        num_for_rdo = h->lcu.get_intra_dir_for_rdo_luma8(h, p_cu, p_candidates, p_fenc, mpm, blockidx,
                       block_x, block_y, block_w, block_h);
 
         // store the coding state
@@ -1061,16 +1234,16 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         /* RDO */
         for (i = 0; i < num_for_rdo; i++) {
             //rdcost_t rdcost;
-            dist_t dist_curr;     // µ±Ç°ÁÁ¶ÈÖ¡ÄÚ¿éµÄÊ§Õæ
-            int rate_curr = 0; // µ±Ç°ÁÁ¶ÈÖ¡ÄÚ¿éµÄÂëÂÊ£¨±ÈÌØÊý£©
+            dist_t dist_curr;     // distortion of the current intra luma block
+            int rate_curr = 0; // rate (number of bits) of the current intra luma block
             int Mode = p_candidates[i].mode;
-            pel_t *p_pred = p_enc->intra_pred[Mode];
+            pel8_t *p_pred = p_enc->intra8_pred[Mode];
 
             // get and check rate_chroma-distortion cost
             int mode_idx_aec = (mpm[0] == Mode) ? -2 : ((mpm[1] == Mode) ? -1 : (mpm[0] > Mode ? Mode : (mpm[1] > Mode ? Mode - 1 : Mode - 2)));
             int num_nonzero;
 
-            num_nonzero = cu_recon_intra_luma(h, p_aec, p_cu, p_pred,
+            num_nonzero = cu_recon_intra_luma8(h, p_aec, p_cu, p_pred,
                                               block_w, block_h, block_x, block_y,
                                               blockidx, Mode, &dist_curr);
             num_nonzero = !!num_nonzero;
@@ -1098,7 +1271,7 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
 
             // choose best mode
             if (rdcost < best_rdcost) {
-                XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]);
+                XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]);
                 XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
 
                 // set best mode update minimum cost
@@ -1123,14 +1296,14 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         /* change the coding state to BEST */
         if (best_rate < INT_MAX) {
             if (p_cu->cu_info.i_mode != PRED_I_2Nx2N) {
-                g_funcs.pixf.copy_pp[PART_INDEX(block_w, block_h)](h->lcu.p_fdec[0] + pos_y * FDEC_STRIDE + pos_x, FDEC_STRIDE,
-                        p_layer->p_rec_tmp[0] + block_y * FREC_STRIDE + block_x, FREC_STRIDE);
+                g_funcs.pixf.copy_pp8[PART_INDEX(block_w, block_h)](h->lcu.p_fdec8[0] + pos_y * FDEC_STRIDE + pos_x, FDEC_STRIDE,
+                        p_layer->p_rec8_tmp[0] + block_y * FREC_STRIDE + block_x, FREC_STRIDE);
             }
 
             /* copy coefficients and reconstructed data for best mode */
-            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]);
             XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
-            p_best_part[blockidx] = p_cu->cu_info.p_rec[0];
+            p_best_part[blockidx] = p_cu->cu_info.p_rec8[0];
 
             /* set intra mode prediction */
             p_cu->cu_info.pred_intra_modes[blockidx] = (int8_t)best_pmode;
@@ -1140,29 +1313,29 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
             h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu);
         }
 
-        /* ±£´æ×îÓÅÄ£Ê½µÄ×´Ì¬£ºÊ§Õæ¡¢ÁÁ¶È·ÖÁ¿±ÈÌØÊý£¨ÅÅ³ýµôÁÁ¶ÈÔ¤²âÄ£Ê½£©£¬CBP */
+        /* save the state of the best mode: distortion, luma bits (excluding the luma prediction mode), CBP */
         rdcost_luma += best_dist + h->f_lambda_mode * best_rate;
         p_cu->cu_info.i_cbp |= (best_cbp) << blockidx;
 
-        /* ÁÁ¶È¿éRDOµÄÌáÇ°ÖÕÖ¹ */
+        /* early termination of the luma block RDO */
         if (rdcost_luma >= *min_rdcost) {
             p_layer->mode_rdcost[mode] = MAX_COST;        /* set the cost for SDIP fast algorithm */
             h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);
-            return;  // ÁÁ¶È¿éµÄ×îÓÅrdcostÒÑ¾­³¬¹ýµ±Ç°×îÓÅÖµ£¬Í£Ö¹ºóÐøÉ«¶È¿éµÄÄ£Ê½±éÀú
+            return;  // the best luma rdcost already exceeds the current best value; skip the subsequent chroma mode search
         }
     }
     p_cu->feature.rdcost_luma = rdcost_luma;
 
     /* 2, store best luma reconstruction pixels */
     for (blockidx = 0; blockidx < num_luma_block; blockidx++) {
-        if (p_best_part[blockidx] != p_cu->cu_info.p_rec[0]) {
+        if (p_best_part[blockidx] != p_cu->cu_info.p_rec8[0]) {
             int offset = p_cu->cu_info.cb[blockidx].y * FREC_STRIDE + p_cu->cu_info.cb[blockidx].x;
             int offset_coeff = blockidx << ((p_cu->cu_info.i_level - 1) << 1);
             int w_tr = p_cu->cu_info.cb[0].w;
             int h_tr = p_cu->cu_info.cb[0].h;
             int part_idx = PART_INDEX(w_tr, h_tr);
-            g_funcs.pixf.copy_pp[part_idx](p_cu->cu_info.p_rec[0]   + offset, FREC_STRIDE, p_layer->p_rec_tmp[0]  + offset, p_cu->i_size);
-            g_funcs.pixf.copy_ss[part_idx](p_cu->cu_info.p_coeff[0] + offset_coeff, w_tr, p_layer->p_coeff_tmp[0] + offset_coeff, w_tr);
+            g_funcs.pixf.copy_pp8[part_idx](p_cu->cu_info.p_rec8[0]   + offset, FREC_STRIDE, p_layer->p_rec8_tmp[0]  + offset, p_cu->i_size);
+            g_funcs.pixf.copy_ss8[part_idx](p_cu->cu_info.p_coeff[0] + offset_coeff, w_tr, p_layer->p_coeff_tmp[0] + offset_coeff, w_tr);
         }
     }
 
@@ -1177,13 +1350,13 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         num_rdo_chroma_mode = h->get_intra_candidates_chroma(h, p_cu, level - 1, pix_y_c, pix_x_c, p_candidates);
 
         for (idx_chroma_mode = 0; idx_chroma_mode < num_rdo_chroma_mode; idx_chroma_mode++) {
-            dist_t dist_chroma = 0;  // É«¶È¿éµÄÖ¸Õë
+            dist_t dist_chroma = 0;  // accumulated distortion of the chroma blocks
             int rate_chroma = 0;
             int bits_left;
             int predmode_c = p_candidates[idx_chroma_mode].mode;
             int cbp_c;
 
-            /* Ìø¹ýÉ«¶È·ÖÁ¿µÚ¶þ´Îµ÷ÓÃ¹ý³ÌÖÐµÄÄ£Ê½Ñ¡Ôñ£¬Ö±½ÓÑ¡µ½×îÓÅÄ£Ê½Íê³ÉRDOQ */
+            /* skip chroma mode selection during the second call and go straight to the best mode to complete RDOQ */
             if ((h->param->i_rdoq_level == RDOQ_CU_LEVEL && h->lcu.b_enable_rdoq) && predmode_c != best->i_intra_mode_c) {
                 continue;
             }
@@ -1192,7 +1365,7 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
             }
             p_cu->cu_info.i_intra_mode_c = (int8_t)predmode_c;
 
-            /* Íê³ÉRDO¹ý³ÌµÄÉ«¶È¿éµÄÖØ¹¹¹ý³Ì£¨±ä»»¡¢Á¿»¯¡¢·´±ä»»·´Á¿»¯¼°ÇóÖØ¹¹Öµ£© */
+            /* reconstruct the chroma blocks for RDO (transform, quantization, inverse quantization, inverse transform, and reconstruction) */
             cbp_c = cu_recon_chroma(h, p_aec, p_cu, &dist_chroma);
 
             p_cu->cu_info.i_cbp = (int8_t)(tmp_cbp_luma + cbp_c);
@@ -1257,153 +1430,517 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
     h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);  /* revert to initial AEC context */
 
     /* 4, confirm the buffer pointers and record the best information */
-    if (best->p_rec[0] == rec_bak_y && b_need_swap_buf) {
-        XAVS2_SWAP_PTR(best->p_rec[0],   p_cu->cu_info.p_rec[0]);
+    if (best->p_rec8[0] == rec_bak_y && b_need_swap_buf) {
+        XAVS2_SWAP_PTR(best->p_rec8[0],   p_cu->cu_info.p_rec8[0]);
         XAVS2_SWAP_PTR(best->p_coeff[0], p_cu->cu_info.p_coeff[0]);
     }
 
     p_layer->mode_rdcost[mode] = min_mode_rdcost;    /* store the cost for SDIP fast algorithm */
-}
+    } else {
+    pel10_t *rec_bak_y = best->p_rec10[0];
+    pel10_t *p_best_part[4];
+    /* 1, intra luma prediction and mode decision */
+    for (blockidx = 0; blockidx < num_luma_block; blockidx++) {
+        int mpm[2];  // most probable modes (MPMs) for current luma block
+        int block_x = p_cu->cu_info.cb[blockidx].x;
+        int block_y = p_cu->cu_info.cb[blockidx].y;
+        int block_w = p_cu->cu_info.cb[blockidx].w;
+        int block_h = p_cu->cu_info.cb[blockidx].h;
+        int pos_x = p_cu->i_pos_x + block_x;
+        int pos_y = p_cu->i_pos_y + block_y;
+        int b4x4_x = (p_cu->i_pix_x + block_x) >> MIN_PU_SIZE_IN_BIT;
+        dist_t best_dist = MAX_DISTORTION;
+        int best_rate = INT_MAX;
+        int best_mode = 0;
+        int best_pmode = 0;
+        int best_cbp = 0;
+        pel10_t *p_fenc = h->lcu.p_fenc10[0] + pos_y * FENC_STRIDE + pos_x;
+        rdcost_t best_rdcost = MAX_COST;
+        int i;
+        int num_for_rdo;
+        p_candidates = p_layer->intra_candidates;   // candidate list, reserving the cost
 
-//#if OPT_BYPASS_SDIP
-/* ---------------------------------------------------------------------------
- * SDIP fast
- */
-static ALWAYS_INLINE int sdip_early_bypass(xavs2_t *h, cu_layer_t *p_layer, int i_mode)
-{
-    UNUSED_PARAMETER(h);
-    return i_mode == PRED_I_nx2N && (p_layer->mode_rdcost[PRED_I_2Nxn] < p_layer->mode_rdcost[PRED_I_2Nx2N] * 0.9);
-}
-//#endif
+        /* init */
+        xavs2_get_mpms(h, p_cu, blockidx, pos_y, b4x4_x, mpm);
 
-/**
- * ===========================================================================
- * local function defines (inter)
- * ===========================================================================
- */
+        for (i = 0; i < INTRA_MODE_NUM_FOR_RDO; i++) {
+            p_candidates[i].mode = 0;
+            p_candidates[i].cost = MAX_COST;
+        }
 
-//#if OPT_FAST_ZBLOCK || OPT_ECU
-static const int tab_th_zero_block_sad[][5] = {
-    {    7,   19,   72,  281,  1115 }, {    7,   19,   73,  281,  1116 }, {    7,   20,   73,  282,  1118 },
-    {    8,   20,   74,  283,  1120 }, {    8,   20,   74,  284,  1122 }, {    8,   20,   75,  285,  1124 },
-    {    8,   21,   75,  286,  1126 }, {    8,   21,   76,  288,  1129 }, {    9,   21,   77,  289,  1132 },
-    {    9,   22,   77,  291,  1135 }, {    9,   22,   78,  292,  1138 }, {   10,   23,   79,  294,  1142 },
-    {   10,   23,   80,  296,  1146 }, {   10,   24,   81,  298,  1150 }, {   11,   24,   82,  301,  1155 },
-    {   11,   25,   84,  303,  1160 }, {   12,   26,   85,  306,  1166 }, {   12,   26,   87,  309,  1172 },
-    {   13,   27,   88,  312,  1179 }, {   13,   28,   90,  316,  1186 }, {   14,   29,   92,  320,  1194 },
-    {   15,   30,   94,  325,  1203 }, {   15,   31,   97,  329,  1213 }, {   16,   33,   99,  334,  1223 },
-    {   17,   34,  102,  340,  1235 }, {   18,   36,  105,  346,  1247 }, {   20,   37,  109,  353,  1260 },
-    {   21,   39,  112,  360,  1275 }, {   22,   41,  116,  368,  1292 }, {   24,   43,  121,  377,  1309 },
-    {   25,   46,  125,  386,  1328 }, {   27,   48,  131,  397,  1349 }, {   29,   51,  136,  408,  1372 },
-    {   31,   54,  142,  420,  1397 }, {   33,   58,  149,  434,  1424 }, {   36,   61,  156,  448,  1453 },
-    {   38,   65,  164,  464,  1485 }, {   41,   70,  173,  482,  1520 }, {   45,   74,  183,  501,  1559 },
-    {   48,   79,  193,  521,  1600 }, {   52,   85,  204,  544,  1646 }, {   56,   91,  217,  569,  1696 },
-    {   61,   98,  230,  596,  1750 }, {   66,  105,  245,  625,  1809 }, {   71,  113,  261,  657,  1873 },
-    {   77,  122,  278,  692,  1944 }, {   83,  132,  297,  729,  2020 }, {   90,  142,  318,  771,  2104 },
-    {   98,  153,  341,  816,  2195 }, {  106,  166,  365,  865,  2294 }, {  116,  179,  392,  919,  2403 },
-    {  126,  194,  422,  978,  2521 }, {  136,  210,  454, 1042,  2649 }, {  148,  227,  488, 1111,  2790 },
-    {  161,  246,  526, 1187,  2943 }, {  175,  267,  568, 1270,  3110 }, {  191,  290,  613, 1360,  3292 },
-    {  207,  314,  662, 1459,  3491 }, {  225,  341,  716, 1566,  3707 }, {  245,  370,  775, 1683,  3944 },
-    {  267,  402,  839, 1811,  4201 }, {  291,  437,  909, 1950,  4482 }, {  316,  475,  985, 2102,  4788 },
-    {  345,  517, 1068, 2268,  5123 }, {  375,  562, 1158, 2448,  5487 }, {  412,  617, 1268, 2667,  5928 },
-    {  445,  665, 1364, 2860,  6317 }, {  485,  724, 1482, 3094,  6790 }, {  528,  788, 1610, 3350,  7305 },
-    {  576,  858, 1749, 3628,  7867 }, {  631,  939, 1912, 3954,  8524 }, {  687, 1022, 2078, 4285,  9192 },
-    {  748, 1113, 2259, 4647,  9920 }, {  812, 1206, 2446, 5019, 10671 }, {  884, 1313, 2661, 5448, 11537 },
-    {  964, 1431, 2895, 5917, 12482 }, { 1047, 1553, 3140, 6406, 13469 }, { 1145, 1698, 3430, 6985, 14636 },
-    { 1248, 1850, 3735, 7592, 15862 }, { 1357, 2011, 4055, 8233, 17154 }
-};
+        /* conduct prediction and get intra prediction direction candidates for RDO */
+        num_for_rdo = h->lcu.get_intra_dir_for_rdo_luma10(h, p_cu, p_candidates, p_fenc, mpm, blockidx,
+                      block_x, block_y, block_w, block_h);
 
-/* ---------------------------------------------------------------------------
- */
-static ALWAYS_INLINE
-bool_t isZeroCuFast(xavs2_t *h, cu_t *p_cu)
-{
-    int i_level = p_cu->cu_info.i_level - MIN_PU_SIZE_IN_BIT;
-    int i_qp = cu_get_qp(h, &p_cu->cu_info);
-    int thres_satd = (int)(tab_th_zero_block_sad[i_qp][i_level] * h->param->factor_zero_block);
+        // store the coding state
+        h->copy_aec_state_rdo(&p_enc->cs_pu_init, p_aec);
 
-    return p_cu->sum_satd < thres_satd;
-}
-//#endif
+        /* RDO */
+        for (i = 0; i < num_for_rdo; i++) {
+            //rdcost_t rdcost;
+            dist_t dist_curr;     // distortion of the current intra luma block
+            int rate_curr = 0; // rate (number of bits) of the current intra luma block
+            int Mode = p_candidates[i].mode;
+            pel10_t *p_pred = p_enc->intra10_pred[Mode];
 
-/* ---------------------------------------------------------------------------
- * int scrFlag = 0;             // 0=noSCR, 1=strongSCR, 2=jmSCR
- */
-static INLINE int
-tu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
-                    int i_level, int8_t *cbp, int blockidx, coeff_t *cur_blk,
-                    int x_pu, int y_pu, int w_pu, int h_pu)
-{
-    cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level);
-    int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS);
-    int part_idx = PART_INDEX(w_pu, h_pu);
-    int w_tr = w_pu >> used_wavelet;
-    int h_tr = h_pu >> used_wavelet;
-    int num_non_zero = 0;
-    pel_t *p_fdec = p_cu->cu_info.p_rec[0] + y_pu * FREC_STRIDE + x_pu;
-    pel_t *p_pred = p_layer->buf_pred_inter + y_pu * FREC_STRIDE + x_pu;
-    coeff_t *coeff_y = p_cu->cu_info.p_coeff[0] + (blockidx << ((p_cu->cu_info.i_level - 1) << 1));
+            // get and check rate_chroma-distortion cost
+            int mode_idx_aec = (mpm[0] == Mode) ? -2 : ((mpm[1] == Mode) ? -1 : (mpm[0] > Mode ? Mode : (mpm[1] > Mode ? Mode - 1 : Mode - 2)));
+            int num_nonzero;
 
-    tu_get_dct_coeff(h, cur_blk, part_idx, w_tr, h_tr);
+            num_nonzero = cu_recon_intra_luma10(h, p_aec, p_cu, p_pred,
+                                              block_w, block_h, block_x, block_y,
+                                              blockidx, Mode, &dist_curr);
+            num_nonzero = !!num_nonzero;
+            {
+                int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS);
+                int w_tr = block_w >> used_wavelet;
+                int i_tu_level = p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON) - used_wavelet;
+                int rate_luma_mode;
+                coeff_t *p_coeff_y = p_cu->cu_info.p_coeff[0] + (blockidx << ((p_cu->cu_info.i_level - 1) << 1));
 
-    num_non_zero = tu_quant_forward(h, p_aec, p_cu, cur_blk, i_level, w_tr, h_tr,
-                                    cu_get_qp(h, &p_cu->cu_info), 0, 1, DC_PRED);
+                // get rate for intra prediction mode
+                rate_luma_mode = p_aec->binary.write_intra_pred_mode(p_aec, mode_idx_aec);
 
-    if (num_non_zero != 0) {
-        *cbp |= (1 << blockidx);    // Ö¸¶¨Î»ÉèÖÃÎª 1
-        g_funcs.pixf.copy_ss[PART_INDEX(w_tr, h_tr)](coeff_y, w_tr, cur_blk, w_tr);
+                // get rate for luminance coefficients
+                if (num_nonzero) {
+                    int bits_left = rdo_get_left_bits(h, best_rdcost, dist_curr) - rate_luma_mode;
+                    rate_curr = p_aec->binary.est_luma_block_coeff(h, p_aec, p_cu, p_coeff_y, &p_enc->runlevel, i_tu_level, xavs2_log2u(w_tr),
+                                1, Mode, bits_left);
+                    rate_luma_mode += rate_curr;
+                }
 
-        tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_level, cu_get_qp(h, &p_cu->cu_info), 1);
-        g_funcs.dctf.idct[part_idx](cur_blk, cur_blk, w_tr);
+                // calculate RD-cost and return it
+                rdcost = dist_curr + h->f_lambda_mode * rate_luma_mode;
+            }
 
-        g_funcs.pixf.add_ps[part_idx](p_fdec, FREC_STRIDE, p_pred, cur_blk, FREC_STRIDE, w_pu);
+            // choose best mode
+            if (rdcost < best_rdcost) {
+                XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]);
+                XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
+
+                // set best mode update minimum cost
+                best_dist = dist_curr;
+                best_rate = rate_curr;
+                best_rdcost = rdcost;
+                best_mode = Mode;
+                best_pmode = mode_idx_aec;
+                best_cbp = num_nonzero;   // flag if dct-coefficients must be coded
+                h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec);
+            }
+
+            h->copy_aec_state_rdo(p_aec, &p_enc->cs_pu_init);
+
+            if (IS_ALG_ENABLE(OPT_ET_RDO_INTRA_L)) {
+                if (rdcost > best_rdcost * 1.2) {
+                    break;
+                }
+            }
+        }   // for (i = 0; i < num_for_rdo; i++)
+
+        /* change the coding state to BEST */
+        if (best_rate < INT_MAX) {
+            if (p_cu->cu_info.i_mode != PRED_I_2Nx2N) {
+                g_funcs.pixf.copy_pp10[PART_INDEX(block_w, block_h)](h->lcu.p_fdec10[0] + pos_y * FDEC_STRIDE + pos_x, FDEC_STRIDE,
+                        p_layer->p_rec10_tmp[0] + block_y * FREC_STRIDE + block_x, FREC_STRIDE);
+            }
+
+            /* copy coefficients and reconstructed data for best mode */
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
+            p_best_part[blockidx] = p_cu->cu_info.p_rec10[0];
+
+            /* set intra mode prediction */
+            p_cu->cu_info.pred_intra_modes[blockidx] = (int8_t)best_pmode;
+            p_cu->cu_info.real_intra_modes[blockidx] = (int8_t)best_mode;
+
+            /* copy coding state */
+            h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu);
+        }
+
+        /* Save the state of the best mode: distortion, luma bits (excluding the luma prediction mode), CBP */
+        rdcost_luma += best_dist + h->f_lambda_mode * best_rate;
+        p_cu->cu_info.i_cbp |= (best_cbp) << blockidx;
+
+        /* Early termination of the luma block RDO */
+        if (rdcost_luma >= *min_rdcost) {
+            p_layer->mode_rdcost[mode] = MAX_COST;        /* set the cost for SDIP fast algorithm */
+            h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);
+            return;  // best luma rdcost already exceeds the current best; skip the chroma mode search
+        }
+    }
+    p_cu->feature.rdcost_luma = rdcost_luma;
+
+    /* 2, store best luma reconstruction pixels */
+    for (blockidx = 0; blockidx < num_luma_block; blockidx++) {
+        if (p_best_part[blockidx] != p_cu->cu_info.p_rec10[0]) {
+            int offset = p_cu->cu_info.cb[blockidx].y * FREC_STRIDE + p_cu->cu_info.cb[blockidx].x;
+            int offset_coeff = blockidx << ((p_cu->cu_info.i_level - 1) << 1);
+            int w_tr = p_cu->cu_info.cb[0].w;
+            int h_tr = p_cu->cu_info.cb[0].h;
+            int part_idx = PART_INDEX(w_tr, h_tr);
+            g_funcs.pixf.copy_pp10[part_idx](p_cu->cu_info.p_rec10[0]   + offset, FREC_STRIDE, p_layer->p_rec10_tmp[0]  + offset, p_cu->i_size);
+            g_funcs.pixf.copy_ss10[part_idx](p_cu->cu_info.p_coeff[0] + offset_coeff, w_tr, p_layer->p_coeff_tmp[0] + offset_coeff, w_tr);
+        }
+    }
+
+    /* 3, Chroma mode decision and CU mode updating */
+    if (h->param->chroma_format != CHROMA_400) {
+        int lmode;
+        int num_rdo_chroma_mode;
+        int idx_chroma_mode;
+        int tmp_cbp_luma = p_cu->cu_info.i_cbp;
+
+        lmode = tab_intra_mode_luma2chroma[p_cu->cu_info.real_intra_modes[0]];
+        num_rdo_chroma_mode = h->get_intra_candidates_chroma(h, p_cu, level - 1, pix_y_c, pix_x_c, p_candidates);
+
+        for (idx_chroma_mode = 0; idx_chroma_mode < num_rdo_chroma_mode; idx_chroma_mode++) {
+            dist_t dist_chroma = 0;  // chroma distortion (filled in by cu_recon_chroma)
+            int rate_chroma = 0;
+            int bits_left;
+            int predmode_c = p_candidates[idx_chroma_mode].mode;
+            int cbp_c;
+
+            /* Skip chroma mode selection on the second pass: go straight to the best mode to finish RDOQ */
+            if ((h->param->i_rdoq_level == RDOQ_CU_LEVEL && h->lcu.b_enable_rdoq) && predmode_c != best->i_intra_mode_c) {
+                continue;
+            }
+            if (predmode_c != DM_PRED_C && predmode_c == lmode) {
+                continue;
+            }
+            p_cu->cu_info.i_intra_mode_c = (int8_t)predmode_c;
+
+            /* Reconstruct the chroma blocks for RDO (transform, quantization, inverse quantization/transform, reconstruction) */
+            cbp_c = cu_recon_chroma(h, p_aec, p_cu, &dist_chroma);
+
+            p_cu->cu_info.i_cbp = (int8_t)(tmp_cbp_luma + cbp_c);
+
+            /* ------- GET RATE -------- */
+            rate_chroma = p_aec->binary.est_cu_header(h, p_aec, p_cu);
+#if ENABLE_RATE_CONTROL_CU
+            rate_chroma += p_aec->binary.write_cu_cbp_dqp(h, p_aec, &p_cu->cu_info, h->i_slice_index, h->last_dquant);
+#else
+            rate_chroma += p_aec->binary.write_cu_cbp(p_aec, &p_cu->cu_info, h->i_slice_index, h);
+#endif
+
+            bits_left = rdo_get_left_bits(h, *min_rdcost - rdcost_luma, dist_chroma);
+
+            if (p_cu->cu_info.i_cbp & (1 << 4)) {
+                int cur_bits_left = bits_left - rate_chroma;
+                rate_chroma += p_aec->binary.est_chroma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[1], &p_enc->runlevel, level - 1, cur_bits_left);
+            }
+            if (p_cu->cu_info.i_cbp & (1 << 5)) {
+                int cur_bits_left = bits_left - rate_chroma;
+                rate_chroma += p_aec->binary.est_chroma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[2], &p_enc->runlevel, level - 1, cur_bits_left);
+            }
+
+            rdcost = dist_chroma + h->f_lambda_mode * rate_chroma + rdcost_luma;
+
+            min_mode_rdcost = XAVS2_MIN(rdcost, min_mode_rdcost);
+
+            if (rdcost < *min_rdcost) {
+                *min_rdcost = rdcost;
+                h->copy_aec_state_rdo(&p_layer->cs_cu, p_aec);    /* store coding state for the best mode */
+                cu_store_parameters(h, p_cu, best);
+                b_need_swap_buf = 1;
+            }
+
+            h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu);   /* revert to AEC context of best Luma mode */
+
+            if (IS_ALG_ENABLE(OPT_FAST_RDO_INTRA_C)) {
+                if (rdcost > *min_rdcost * 2 ||
+                    cbp_c == 0) {
+                    break;
+                }
+            }
+        }
+    } else {   /* YUV400 */
+        /* ------- GET RATE -------- */
+        int rate_hdr = p_aec->binary.est_cu_header(h, p_aec, p_cu);
+#if ENABLE_RATE_CONTROL_CU
+        rate_hdr += p_aec->binary.write_cu_cbp_dqp(h, p_aec, &p_cu->cu_info, h->i_slice_index, h->last_dquant);
+#else
+        rate_hdr += p_aec->binary.write_cu_cbp(p_aec, &p_cu->cu_info, h->i_slice_index, h);
+#endif
+        rdcost = h->f_lambda_mode * rate_hdr + rdcost_luma;
+
+        if (rdcost < *min_rdcost) {
+            *min_rdcost = rdcost;
+            h->copy_aec_state_rdo(&p_layer->cs_cu, p_aec);    /* store coding state for the best mode */
+            cu_store_parameters(h, p_cu, best);
+            b_need_swap_buf = 1;
+        }
+    }
+
+    h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);  /* revert to initial AEC context */
+
+    /* 4, confirm the buffer pointers and record the best information */
+    if (best->p_rec10[0] == rec_bak_y && b_need_swap_buf) {
+        XAVS2_SWAP_PTR(best->p_rec10[0],   p_cu->cu_info.p_rec10[0]);
+        XAVS2_SWAP_PTR(best->p_coeff[0], p_cu->cu_info.p_coeff[0]);
+    }
+
+    p_layer->mode_rdcost[mode] = min_mode_rdcost;    /* store the cost for SDIP fast algorithm */
+    }
+}
+
+//#if OPT_BYPASS_SDIP
+/* ---------------------------------------------------------------------------
+ * SDIP fast: non-zero when i_mode is PRED_I_nx2N and mode_rdcost[PRED_I_2Nxn] is below 90% of mode_rdcost[PRED_I_2Nx2N]
+ */
+static ALWAYS_INLINE int sdip_early_bypass(xavs2_t *h, cu_layer_t *p_layer, int i_mode)
+{
+    UNUSED_PARAMETER(h);   /* h is not read by this check */
+    return i_mode == PRED_I_nx2N && (p_layer->mode_rdcost[PRED_I_2Nxn] < p_layer->mode_rdcost[PRED_I_2Nx2N] * 0.9);
+}
+//#endif
+
+/**
+ * ===========================================================================
+ * local function defines (inter)
+ * ===========================================================================
+ */
+
+//#if OPT_FAST_ZBLOCK || OPT_ECU
+static const int tab_th_zero_block_sad[][5] = {   /* read as [qp][cu level - MIN_PU_SIZE_IN_BIT] by isZeroCuFast() */
+    {    7,   19,   72,  281,  1115 }, {    7,   19,   73,  281,  1116 }, {    7,   20,   73,  282,  1118 },
+    {    8,   20,   74,  283,  1120 }, {    8,   20,   74,  284,  1122 }, {    8,   20,   75,  285,  1124 },
+    {    8,   21,   75,  286,  1126 }, {    8,   21,   76,  288,  1129 }, {    9,   21,   77,  289,  1132 },
+    {    9,   22,   77,  291,  1135 }, {    9,   22,   78,  292,  1138 }, {   10,   23,   79,  294,  1142 },
+    {   10,   23,   80,  296,  1146 }, {   10,   24,   81,  298,  1150 }, {   11,   24,   82,  301,  1155 },
+    {   11,   25,   84,  303,  1160 }, {   12,   26,   85,  306,  1166 }, {   12,   26,   87,  309,  1172 },
+    {   13,   27,   88,  312,  1179 }, {   13,   28,   90,  316,  1186 }, {   14,   29,   92,  320,  1194 },
+    {   15,   30,   94,  325,  1203 }, {   15,   31,   97,  329,  1213 }, {   16,   33,   99,  334,  1223 },
+    {   17,   34,  102,  340,  1235 }, {   18,   36,  105,  346,  1247 }, {   20,   37,  109,  353,  1260 },
+    {   21,   39,  112,  360,  1275 }, {   22,   41,  116,  368,  1292 }, {   24,   43,  121,  377,  1309 },
+    {   25,   46,  125,  386,  1328 }, {   27,   48,  131,  397,  1349 }, {   29,   51,  136,  408,  1372 },
+    {   31,   54,  142,  420,  1397 }, {   33,   58,  149,  434,  1424 }, {   36,   61,  156,  448,  1453 },
+    {   38,   65,  164,  464,  1485 }, {   41,   70,  173,  482,  1520 }, {   45,   74,  183,  501,  1559 },
+    {   48,   79,  193,  521,  1600 }, {   52,   85,  204,  544,  1646 }, {   56,   91,  217,  569,  1696 },
+    {   61,   98,  230,  596,  1750 }, {   66,  105,  245,  625,  1809 }, {   71,  113,  261,  657,  1873 },
+    {   77,  122,  278,  692,  1944 }, {   83,  132,  297,  729,  2020 }, {   90,  142,  318,  771,  2104 },
+    {   98,  153,  341,  816,  2195 }, {  106,  166,  365,  865,  2294 }, {  116,  179,  392,  919,  2403 },
+    {  126,  194,  422,  978,  2521 }, {  136,  210,  454, 1042,  2649 }, {  148,  227,  488, 1111,  2790 },
+    {  161,  246,  526, 1187,  2943 }, {  175,  267,  568, 1270,  3110 }, {  191,  290,  613, 1360,  3292 },
+    {  207,  314,  662, 1459,  3491 }, {  225,  341,  716, 1566,  3707 }, {  245,  370,  775, 1683,  3944 },
+    {  267,  402,  839, 1811,  4201 }, {  291,  437,  909, 1950,  4482 }, {  316,  475,  985, 2102,  4788 },
+    {  345,  517, 1068, 2268,  5123 }, {  375,  562, 1158, 2448,  5487 }, {  412,  617, 1268, 2667,  5928 },
+    {  445,  665, 1364, 2860,  6317 }, {  485,  724, 1482, 3094,  6790 }, {  528,  788, 1610, 3350,  7305 },
+    {  576,  858, 1749, 3628,  7867 }, {  631,  939, 1912, 3954,  8524 }, {  687, 1022, 2078, 4285,  9192 },
+    {  748, 1113, 2259, 4647,  9920 }, {  812, 1206, 2446, 5019, 10671 }, {  884, 1313, 2661, 5448, 11537 },
+    {  964, 1431, 2895, 5917, 12482 }, { 1047, 1553, 3140, 6406, 13469 }, { 1145, 1698, 3430, 6985, 14636 },
+    { 1248, 1850, 3735, 7592, 15862 }, { 1357, 2011, 4055, 8233, 17154 }
+};
+
+/* ---------------------------------------------------------------------------
+ * true when p_cu->sum_satd is below the tab_th_zero_block_sad entry for this QP and CU level, scaled by factor_zero_block */
+static ALWAYS_INLINE
+bool_t isZeroCuFast(xavs2_t *h, cu_t *p_cu)
+{
+    int i_level = p_cu->cu_info.i_level - MIN_PU_SIZE_IN_BIT;   /* column index into the threshold table */
+    int i_qp = cu_get_qp(h, &p_cu->cu_info);                    /* row index into the threshold table */
+    int thres_satd = (int)(tab_th_zero_block_sad[i_qp][i_level] * h->param->factor_zero_block);   /* scaled threshold, truncated to int */
+
+    return p_cu->sum_satd < thres_satd;
+}
+//#endif
+
+/* ---------------------------------------------------------------------------
+ * int scrFlag = 0;             // 0=noSCR, 1=strongSCR, 2=jmSCR
+ * Quantize one inter luma TU and, if any coefficient is non-zero, rebuild it into p_rec; returns tu_quant_forward()'s non-zero count */
+static INLINE int
+tu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
+                    int i_level, int8_t *cbp, int blockidx, coeff_t *cur_blk,
+                    int x_pu, int y_pu, int w_pu, int h_pu)
+{
+    cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level);
+    int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS);
+    int part_idx = PART_INDEX(w_pu, h_pu);
+    int w_tr = w_pu >> used_wavelet;   /* transform width: halved when used_wavelet is set */
+    int h_tr = h_pu >> used_wavelet;   /* transform height: halved when used_wavelet is set */
+    int num_non_zero = 0;
+    coeff_t *coeff_y = p_cu->cu_info.p_coeff[0] + (blockidx << ((p_cu->cu_info.i_level - 1) << 1));   /* this block's offset in the CU luma coefficient buffer */
+
+    tu_get_dct_coeff(h, cur_blk, part_idx, w_tr, h_tr);
+
+    num_non_zero = tu_quant_forward(h, p_aec, p_cu, cur_blk, i_level, w_tr, h_tr,
+                                    cu_get_qp(h, &p_cu->cu_info), 0, 1, DC_PRED);
+
+    if (h->param->input_sample_bit_depth == 8) {   /* 8-bit input: pel8_t buffers and the *8 function tables */
+    pel8_t *p_fdec = p_cu->cu_info.p_rec8[0] + y_pu * FREC_STRIDE + x_pu;
+    pel8_t *p_pred = p_layer->buf_pred_inter8 + y_pu * FREC_STRIDE + x_pu;
+    if (num_non_zero != 0) {
+        *cbp |= (1 << blockidx);    // set the CBP bit of this block to 1
+        g_funcs.pixf.copy_ss8[PART_INDEX(w_tr, h_tr)](coeff_y, w_tr, cur_blk, w_tr);
+
+        tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_level, cu_get_qp(h, &p_cu->cu_info), 1);
+        g_funcs.dctf.idct[part_idx](h, cur_blk, cur_blk, w_tr);
+
+        g_funcs.pixf.add_ps8[part_idx](h, p_fdec, FREC_STRIDE, p_pred, cur_blk, FREC_STRIDE, w_pu);
+    } else {
+        /* Clearing this block's CBP bit is unnecessary: CBP starts at 0 here */
+        // An all-zero block needs no inverse quantization/transform; just copy the prediction as the reconstruction
+        coeff_y[0] = 0;
+        if (p_cu->cu_info.i_tu_split) {
+            g_funcs.pixf.copy_pp8[part_idx](p_fdec, FREC_STRIDE, p_pred, FREC_STRIDE);
+        }
+    }
+    } else {   /* any other bit depth: pel10_t buffers and the *10 function tables */
+    pel10_t *p_fdec = p_cu->cu_info.p_rec10[0] + y_pu * FREC_STRIDE + x_pu;
+    pel10_t *p_pred = p_layer->buf_pred_inter10 + y_pu * FREC_STRIDE + x_pu;
+    if (num_non_zero != 0) {
+        *cbp |= (1 << blockidx);    // set the CBP bit of this block to 1
+        g_funcs.pixf.copy_ss10[PART_INDEX(w_tr, h_tr)](coeff_y, w_tr, cur_blk, w_tr);
+
+        tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_level, cu_get_qp(h, &p_cu->cu_info), 1);
+        g_funcs.dctf.idct[part_idx](h, cur_blk, cur_blk, w_tr);
+
+        g_funcs.pixf.add_ps10[part_idx](h, p_fdec, FREC_STRIDE, p_pred, cur_blk, FREC_STRIDE, w_pu);
     } else {
-        /* Çå³ýCBPÖ¸¶¨Î»µÄÖµ£¬ÕâÀïCBP³õÊ¼ÖµÎª0£¬Òò¶øÎÞÐè²Ù×÷ */
-        // È«Áã¿é²»±Ø×ö·´±ä»»·´Á¿»¯£¬Ö»Ðè¿½±´Ô¤²âÖµÎªÖØ¹¹Öµ
+        /* Clearing this block's CBP bit is unnecessary: CBP starts at 0 here */
+        // An all-zero block needs no inverse quantization/transform; just copy the prediction as the reconstruction
         coeff_y[0] = 0;
         if (p_cu->cu_info.i_tu_split) {
-            g_funcs.pixf.copy_pp[part_idx](p_fdec, FREC_STRIDE, p_pred, FREC_STRIDE);
+            g_funcs.pixf.copy_pp10[part_idx](p_fdec, FREC_STRIDE, p_pred, FREC_STRIDE);
+        }
+    }
+    }
+
+    return num_non_zero;
+}
+
+
+/* ---------------------------------------------------------------------------
+ * Reconstruct the luma component of an inter-predicted CU in the specified way;
+ * returns the distortion of the current CU (including the chroma block distortion)
+ */
+static
+dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
+                           int is_non_residual, int b_tu_split,
+                           int cbp_c, dist_t dist_chroma)
+{
+    cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level);
+    cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level);
+    coeff_t *cur_blk   = p_enc->coeff_blk;
+    coeff_t *coeff_bak = p_enc->coeff_bak;
+    coeff_t *p_resi;
+    int level = p_cu->cu_info.i_level;
+    int num_nonzero = 0;
+    int sum_dc_coeff = 0;
+    int b_zero_block = 0;
+    int blockidx;
+    int pix_x = p_cu->i_pos_x;
+    int pix_y = p_cu->i_pos_y;
+    int cu_size = p_cu->i_size;
+    int cu_size_2 = cu_size >> 1;
+    int cu_size_4 = cu_size_2 >> 1;
+    dist_t distortion;
+
+    /* clear CBP */
+    p_cu->cu_info.i_cbp = 0;
+
+    /* encode for luma */
+    cu_set_tu_split_type(h, &p_cu->cu_info, b_tu_split);
+
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_fenc;
+    pel8_t *p_fdec;
+    if (is_non_residual) {  /* SKIP mode (or no residual coding) */
+        int uvoffset = (FREC_CSTRIDE >> 1);
+        int part_idx_c = PART_INDEX(cu_size_2, cu_size_2);
+        int pix_x_c = pix_x >> 1;
+        int pix_y_c = pix_y >> CHROMA_V_SHIFT;
+
+        h->lcu.bypass_all_dmh |= (p_cu->cu_info.dmh_mode == 0);
+        /* copy Y component and get distortion */
+        p_fenc = h->lcu.p_fenc8[0] + pix_y * FENC_STRIDE + pix_x;
+        p_fdec = p_cu->cu_info.p_rec8[0];
+        g_funcs.pixf.copy_pp8[PART_INDEX(cu_size, cu_size)](p_fdec, FREC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE);
+        distortion = g_funcs.pixf.ssd8[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE);
+
+        /* chroma distortion */
+        if (cbp_c) {
+            /* copy U component and get distortion */
+            p_fenc = h->lcu.p_fenc8[1] + pix_y_c * FENC_STRIDE + pix_x_c;
+            p_fdec = p_cu->cu_info.p_rec8[1];
+            g_funcs.pixf.copy_pp8[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter8_c, FREC_CSTRIDE);
+            distortion += g_funcs.pixf.ssd8[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2);
+
+            /* copy V component and get distortion */
+            p_fenc = h->lcu.p_fenc8[2] + pix_y_c * FENC_STRIDE + pix_x_c;
+            p_fdec = p_cu->cu_info.p_rec8[2];
+            g_funcs.pixf.copy_pp8[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter8_c + uvoffset, FREC_CSTRIDE);
+            distortion += g_funcs.pixf.ssd8[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2);
+        } else {
+            distortion += dist_chroma;
+        }
+
+        return distortion;
+    } else if (p_cu->cu_info.i_tu_split) {
+        int pix_cu_x = 0;
+        int pix_cu_y = 0;
+
+        switch (p_cu->cu_info.i_tu_split) {
+        case TU_SPLIT_HOR:
+            g_funcs.pixf.copy_ss8[PART_INDEX(cu_size, cu_size)](cur_blk, cu_size, coeff_bak, cu_size);
+            for (blockidx = 0; blockidx < 4; blockidx++, pix_cu_y += cu_size_4) {
+                p_resi = cur_blk + pix_cu_y * cu_size + pix_cu_x;
+                num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, p_resi, pix_cu_x, pix_cu_y, cu_size, cu_size_4);
+                sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]);
+            }
+            break;
+        case TU_SPLIT_VER:
+            for (blockidx = 0; blockidx < 4; blockidx++, pix_cu_x += cu_size_4) {
+                p_resi = coeff_bak + pix_cu_y * cu_size + pix_cu_x;
+                g_funcs.pixf.copy_ss8[PART_INDEX(cu_size_4, cu_size)](cur_blk, cu_size_4, p_resi, cu_size);
+                num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, cur_blk, pix_cu_x, pix_cu_y, cu_size_4, cu_size);
+                sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]);
+            }
+            break;
+        default:
+            for (blockidx = 0; blockidx < 4; blockidx++) {
+                pix_cu_x = (blockidx & 1) * cu_size_2;
+                pix_cu_y = (blockidx >> 1) * cu_size_2;
+                p_resi = coeff_bak + pix_cu_y * cu_size + pix_cu_x;
+                g_funcs.pixf.copy_ss8[PART_INDEX(cu_size_2, cu_size_2)](cur_blk, cu_size_2, p_resi, cu_size);
+                num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, cur_blk, pix_cu_x, pix_cu_y, cu_size_2, cu_size_2);
+                sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]);
+            }
+            break;
         }
-    }
-
-    return num_non_zero;
-}
 
+        // If the CU has no more than LUMA_COEFF_COST non-zero coefficients and its DC coefficients are small, treat it as an all-zero block
+        b_zero_block = (num_nonzero <= LUMA_COEFF_COST && sum_dc_coeff <= MAX_COEFF_QUASI_ZERO);
+    } else {
+        if (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block) {
+            b_zero_block = 1;
+        } else {
+            num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level, &p_cu->cu_info.i_cbp, 0, coeff_bak, 0, 0, cu_size, cu_size);
 
-/* ---------------------------------------------------------------------------
- * ÒÔÖ¸¶¨·½Ê½ÖØ¹¹Ö¡¼äÔ¤²â·½Ê½µÄCUµÄÁÁ¶È·ÖÁ¿£»
- * ·µ»Øµ±Ç°CUµØÊ§Õæ£¨¼ÓÉÏÉ«¶È¿éÊ§Õæ£©
- */
-static
-dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
-                           int is_non_residual, int b_tu_split,
-                           int cbp_c, dist_t dist_chroma)
-{
-    cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level);
-    cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level);
-    coeff_t *cur_blk   = p_enc->coeff_blk;
-    coeff_t *coeff_bak = p_enc->coeff_bak;
-    coeff_t *p_resi;
-    int level = p_cu->cu_info.i_level;
-    int num_nonzero = 0;
-    int sum_dc_coeff = 0;
-    int b_zero_block = 0;
-    int blockidx;
-    int pix_x = p_cu->i_pos_x;
-    int pix_y = p_cu->i_pos_y;
-    int cu_size = p_cu->i_size;
-    int cu_size_2 = cu_size >> 1;
-    int cu_size_4 = cu_size_2 >> 1;
-    dist_t distortion;
-    pel_t *p_fenc;
-    pel_t *p_fdec;
+            // If all transform blocks of the CU together have no more than LUMA_COEFF_COST non-zero coefficients and the DC coefficient is small, treat the CU as an all-zero block
+            sum_dc_coeff = XAVS2_ABS(p_cu->cu_info.p_coeff[0][0]);
+            b_zero_block = (num_nonzero <= LUMA_COEFF_COST && sum_dc_coeff <= MAX_COEFF_QUASI_ZERO);
+        }
+    }
 
-    /* clear CBP */
-    p_cu->cu_info.i_cbp = 0;
+    if (b_zero_block) {
+        h->lcu.bypass_all_dmh |= (h->i_type == SLICE_TYPE_F && p_cu->cu_info.dmh_mode == 0);
+        p_cu->cu_info.i_cbp = 0;
+        g_funcs.pixf.copy_pp8[PART_INDEX(cu_size, cu_size)](p_cu->cu_info.p_rec8[0], FREC_STRIDE,
+                p_layer->buf_pred_inter8, FREC_STRIDE);
+    }
 
-    /* encode for luma */
-    cu_set_tu_split_type(h, &p_cu->cu_info, b_tu_split);
+    /* set CBP */
+    p_cu->cu_info.i_cbp += (int8_t)cbp_c;
 
+    /* luma distortion */
+    p_fenc = h->lcu.p_fenc8[0] + pix_y * FENC_STRIDE + pix_x;
+    p_fdec = p_cu->cu_info.p_rec8[0];
+    distortion = dist_chroma;
+    distortion += g_funcs.pixf.ssd8[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE);
+    return distortion;
+    } else {
+    pel10_t *p_fenc;
+    pel10_t *p_fdec;
     if (is_non_residual) {  /* SKIP mode (or no residual coding) */
         int uvoffset = (FREC_CSTRIDE >> 1);
         int part_idx_c = PART_INDEX(cu_size_2, cu_size_2);
@@ -1412,24 +1949,24 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
 
         h->lcu.bypass_all_dmh |= (p_cu->cu_info.dmh_mode == 0);
         /* copy Y component and get distortion */
-        p_fenc = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE + pix_x;
-        p_fdec = p_cu->cu_info.p_rec[0];
-        g_funcs.pixf.copy_pp[PART_INDEX(cu_size, cu_size)](p_fdec, FREC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE);
-        distortion = g_funcs.pixf.ssd[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE);
+        p_fenc = h->lcu.p_fenc10[0] + pix_y * FENC_STRIDE + pix_x;
+        p_fdec = p_cu->cu_info.p_rec10[0];
+        g_funcs.pixf.copy_pp10[PART_INDEX(cu_size, cu_size)](p_fdec, FREC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE);
+        distortion = g_funcs.pixf.ssd10[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE);
 
         /* chroma distortion */
         if (cbp_c) {
             /* copy U component and get distortion */
-            p_fenc = h->lcu.p_fenc[1] + pix_y_c * FENC_STRIDE + pix_x_c;
-            p_fdec = p_cu->cu_info.p_rec[1];
-            g_funcs.pixf.copy_pp[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter_c, FREC_CSTRIDE);
-            distortion += g_funcs.pixf.ssd[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2);
+            p_fenc = h->lcu.p_fenc10[1] + pix_y_c * FENC_STRIDE + pix_x_c;
+            p_fdec = p_cu->cu_info.p_rec10[1];
+            g_funcs.pixf.copy_pp10[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter10_c, FREC_CSTRIDE);
+            distortion += g_funcs.pixf.ssd10[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2);
 
             /* copy V component and get distortion */
-            p_fenc = h->lcu.p_fenc[2] + pix_y_c * FENC_STRIDE + pix_x_c;
-            p_fdec = p_cu->cu_info.p_rec[2];
-            g_funcs.pixf.copy_pp[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter_c + uvoffset, FREC_CSTRIDE);
-            distortion += g_funcs.pixf.ssd[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2);
+            p_fenc = h->lcu.p_fenc10[2] + pix_y_c * FENC_STRIDE + pix_x_c;
+            p_fdec = p_cu->cu_info.p_rec10[2];
+            g_funcs.pixf.copy_pp10[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter10_c + uvoffset, FREC_CSTRIDE);
+            distortion += g_funcs.pixf.ssd10[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2);
         } else {
             distortion += dist_chroma;
         }
@@ -1441,7 +1978,7 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
 
         switch (p_cu->cu_info.i_tu_split) {
         case TU_SPLIT_HOR:
-            g_funcs.pixf.copy_ss[PART_INDEX(cu_size, cu_size)](cur_blk, cu_size, coeff_bak, cu_size);
+            g_funcs.pixf.copy_ss10[PART_INDEX(cu_size, cu_size)](cur_blk, cu_size, coeff_bak, cu_size);
             for (blockidx = 0; blockidx < 4; blockidx++, pix_cu_y += cu_size_4) {
                 p_resi = cur_blk + pix_cu_y * cu_size + pix_cu_x;
                 num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, p_resi, pix_cu_x, pix_cu_y, cu_size, cu_size_4);
@@ -1451,7 +1988,7 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
         case TU_SPLIT_VER:
             for (blockidx = 0; blockidx < 4; blockidx++, pix_cu_x += cu_size_4) {
                 p_resi = coeff_bak + pix_cu_y * cu_size + pix_cu_x;
-                g_funcs.pixf.copy_ss[PART_INDEX(cu_size_4, cu_size)](cur_blk, cu_size_4, p_resi, cu_size);
+                g_funcs.pixf.copy_ss10[PART_INDEX(cu_size_4, cu_size)](cur_blk, cu_size_4, p_resi, cu_size);
                 num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, cur_blk, pix_cu_x, pix_cu_y, cu_size_4, cu_size);
                 sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]);
             }
@@ -1461,14 +1998,14 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
                 pix_cu_x = (blockidx & 1) * cu_size_2;
                 pix_cu_y = (blockidx >> 1) * cu_size_2;
                 p_resi = coeff_bak + pix_cu_y * cu_size + pix_cu_x;
-                g_funcs.pixf.copy_ss[PART_INDEX(cu_size_2, cu_size_2)](cur_blk, cu_size_2, p_resi, cu_size);
+                g_funcs.pixf.copy_ss10[PART_INDEX(cu_size_2, cu_size_2)](cur_blk, cu_size_2, p_resi, cu_size);
                 num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, cur_blk, pix_cu_x, pix_cu_y, cu_size_2, cu_size_2);
                 sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]);
             }
             break;
         }
 
-        // µ±Ç°CU·ÇÁãÏµÊý²»´óÓÚ LUMA_COEFF_COST ¸ö£¬ÇÒDCÏµÊý²¢²»´óµÄÇé¿öÏÂ£¬¿ÉÈÏ¶¨ÎªÈ«Áã¿é
+        // If the current CU holds at most LUMA_COEFF_COST nonzero coefficients and the DC coefficients are small, the CU can be treated as an all-zero block
         b_zero_block = (num_nonzero <= LUMA_COEFF_COST && sum_dc_coeff <= MAX_COEFF_QUASI_ZERO);
     } else {
         if (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block) {
@@ -1476,7 +2013,7 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
         } else {
             num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level, &p_cu->cu_info.i_cbp, 0, coeff_bak, 0, 0, cu_size, cu_size);
 
-            // µ±Ç°CUµÄËùÓÐ±ä»»¿éµÄ·ÇÁãÏµÊýÊýÁ¿£¬²»´óÓÚ LUMA_COEFF_COST ¸ö£¬ÇÒDCÏµÊý²¢²»´óµÄÇé¿öÏÂ£¬¿ÉÈÏ¶¨ÎªÈ«Áã¿é
+            // If all transform blocks of the current CU together hold at most LUMA_COEFF_COST nonzero coefficients and the DC coefficient is small, the CU can be treated as an all-zero block
             sum_dc_coeff = XAVS2_ABS(p_cu->cu_info.p_coeff[0][0]);
             b_zero_block = (num_nonzero <= LUMA_COEFF_COST && sum_dc_coeff <= MAX_COEFF_QUASI_ZERO);
         }
@@ -1485,19 +2022,20 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
     if (b_zero_block) {
         h->lcu.bypass_all_dmh |= (h->i_type == SLICE_TYPE_F && p_cu->cu_info.dmh_mode == 0);
         p_cu->cu_info.i_cbp = 0;
-        g_funcs.pixf.copy_pp[PART_INDEX(cu_size, cu_size)](p_cu->cu_info.p_rec[0], FREC_STRIDE,
-                p_layer->buf_pred_inter, FREC_STRIDE);
+        g_funcs.pixf.copy_pp10[PART_INDEX(cu_size, cu_size)](p_cu->cu_info.p_rec10[0], FREC_STRIDE,
+                p_layer->buf_pred_inter10, FREC_STRIDE);
     }
 
     /* set CBP */
     p_cu->cu_info.i_cbp += (int8_t)cbp_c;
 
     /* luma distortion */
-    p_fenc = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE + pix_x;
-    p_fdec = p_cu->cu_info.p_rec[0];
+    p_fenc = h->lcu.p_fenc10[0] + pix_y * FENC_STRIDE + pix_x;
+    p_fdec = p_cu->cu_info.p_rec10[0];
     distortion = dist_chroma;
-    distortion += g_funcs.pixf.ssd[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE);
+    distortion += g_funcs.pixf.ssd10[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE);
     return distortion;
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -1584,7 +2122,7 @@ static int tu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu,
 }
 
 /* ---------------------------------------------------------------------------
- * »ñÈ¡ÁÁ¶È¡¢É«¶È·ÖÁ¿µÄÔ¤²âÏñËØÖµ£¬·µ»ØMVÊÇ·ñÔÚÓÐÐ§·¶Î§ÄÚ
+ * Get the predicted pixel values of the luma and chroma components; returns whether the MV is within the valid range
  */
 static ALWAYS_INLINE
 int rdo_get_pred_inter(xavs2_t *h, cu_t *p_cu, int cal_luma_chroma)
@@ -1603,16 +2141,14 @@ int rdo_get_pred_inter(xavs2_t *h, cu_t *p_cu, int cal_luma_chroma)
         int pix_x   = p_cu->i_pix_x + start_x;
         int pix_y   = p_cu->i_pix_y + start_y;
 
-        mv_t mv_1st, mv_2nd;   // µÚÒ»£¨Ç°Ïò»òÕßBÖ¡µ¥ÏòÔ¤²â£©ºÍµÚ¶þ£¨ºóÏò£©ÔË¶¯Ê¸Á¿
-        int ref_1st, ref_2nd;  // µÚÒ»£¨Ç°Ïò»òÕßBÖ¡µ¥ÏòÔ¤²â£©ºÍµÚ¶þ£¨ºóÏò£©²Î¿¼Ö¡ºÅ
+        mv_t mv_1st, mv_2nd;   // first (forward, or uni-directional prediction in B frames) and second (backward) motion vectors
+        int ref_1st, ref_2nd;  // first (forward, or uni-directional prediction in B frames) and second (backward) reference frame indices
         int num_mvs;
-        int b_mv_valid;        // MVÊÇ·ñÓÐÐ§£º´óÐ¡È¡ÖµÊÇ·ñÔÚ±ê×¼¹æ¶¨µÄÓÐÐ§·¶Î§ÄÚ
-        pel_t *p_temp = p_enc->buf_pixel_temp;
-        pel_t *p_pred;
+        int b_mv_valid;        // whether the MV is valid, i.e. its magnitude lies within the range allowed by the standard
         xavs2_frame_t *p_ref1 = NULL;
         xavs2_frame_t *p_ref2 = NULL;
 
-        /* MVµÄÊýÁ¿£¬´óÓÚ1ÎªË«²Î¿¼Ö¡/DMHµÄÔ¤²â */
+        /* number of MVs; more than one means dual-reference-frame/DMH prediction */
         num_mvs = cu_get_mvs_for_mc(h, p_cu, blockidx, &mv_1st, &mv_2nd, &ref_1st, &ref_2nd);
         b_mv_valid = check_mv_range(h, &mv_1st, ref_1st, pix_x, pix_y, width, height);
         if (num_mvs > 1) {
@@ -1628,13 +2164,58 @@ int rdo_get_pred_inter(xavs2_t *h, cu_t *p_cu, int cal_luma_chroma)
         }
 
         /* y component */
+        if (h->param->input_sample_bit_depth == 8) {
+        pel8_t *p_temp = p_enc->buf_pixel_temp8;
+        pel8_t *p_pred;
+        if (cal_luma_chroma & 1) {
+            p_pred = p_layer->buf_pred_inter8 + start_y * FREC_STRIDE + start_x;
+
+            mc_luma8(h, p_pred, FREC_STRIDE, mv_1st.x, mv_1st.y, width, height, p_ref1);
+            if (num_mvs > 1) {
+                mc_luma8(h, p_temp, width, mv_2nd.x, mv_2nd.y, width, height, p_ref2);
+                g_funcs.pixf.avg8[PART_INDEX(width, height)](p_pred, FREC_STRIDE, p_pred, FREC_STRIDE, p_temp, width, 32);
+            }
+        }
+
+        /* u and v component */
+        if (h->param->chroma_format == CHROMA_420 && (cal_luma_chroma & 2)) {
+            int uvoffset = (FREC_CSTRIDE >> 1);
+            start_x >>= 1;
+            width   >>= 1;
+            pix_x   >>= 1;
+            start_y >>= CHROMA_V_SHIFT;
+            pix_y   >>= CHROMA_V_SHIFT;
+            height  >>= CHROMA_V_SHIFT;
+
+            p_pred = p_enc->buf_pred_inter8_c + start_y * FREC_CSTRIDE + start_x;
+
+            /* u component */
+            mc_chroma8(h, p_pred, p_pred + uvoffset, FREC_CSTRIDE,
+                      mv_1st.x, mv_1st.y, width, height, p_ref1);
+
+            if (num_mvs > 1) {
+                mc_chroma8(h, p_temp, p_temp + uvoffset, FREC_CSTRIDE,
+                          mv_2nd.x, mv_2nd.y, width, height, p_ref2);
+
+                if (width != 2 && width != 6 && height != 2 && height != 6) {
+                    pixel_avg_pp8_t func_avg = g_funcs.pixf.avg8[PART_INDEX(width, height)];
+                    func_avg(p_pred           , FREC_CSTRIDE, p_pred           , FREC_CSTRIDE, p_temp           , FREC_CSTRIDE, 32);
+                    func_avg(p_pred + uvoffset, FREC_CSTRIDE, p_pred + uvoffset, FREC_CSTRIDE, p_temp + uvoffset, FREC_CSTRIDE, 32);
+                } else {
+                    g_funcs.pixf.average8(p_pred, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE / 2, p_temp, FREC_CSTRIDE / 2, width, height * 2);
+                }
+            }
+        }
+        } else {
+        pel10_t *p_temp = p_enc->buf_pixel_temp10;
+        pel10_t *p_pred;
         if (cal_luma_chroma & 1) {
-            p_pred = p_layer->buf_pred_inter + start_y * FREC_STRIDE + start_x;
+            p_pred = p_layer->buf_pred_inter10 + start_y * FREC_STRIDE + start_x;
 
-            mc_luma(p_pred, FREC_STRIDE, mv_1st.x, mv_1st.y, width, height, p_ref1);
+            mc_luma10(h, p_pred, FREC_STRIDE, mv_1st.x, mv_1st.y, width, height, p_ref1);
             if (num_mvs > 1) {
-                mc_luma(p_temp, width, mv_2nd.x, mv_2nd.y, width, height, p_ref2);
-                g_funcs.pixf.avg[PART_INDEX(width, height)](p_pred, FREC_STRIDE, p_pred, FREC_STRIDE, p_temp, width, 32);
+                mc_luma10(h, p_temp, width, mv_2nd.x, mv_2nd.y, width, height, p_ref2);
+                g_funcs.pixf.avg10[PART_INDEX(width, height)](p_pred, FREC_STRIDE, p_pred, FREC_STRIDE, p_temp, width, 32);
             }
         }
 
@@ -1648,25 +2229,26 @@ int rdo_get_pred_inter(xavs2_t *h, cu_t *p_cu, int cal_luma_chroma)
             pix_y   >>= CHROMA_V_SHIFT;
             height  >>= CHROMA_V_SHIFT;
 
-            p_pred = p_enc->buf_pred_inter_c + start_y * FREC_CSTRIDE + start_x;
+            p_pred = p_enc->buf_pred_inter10_c + start_y * FREC_CSTRIDE + start_x;
 
             /* u component */
-            mc_chroma(p_pred, p_pred + uvoffset, FREC_CSTRIDE,
+            mc_chroma10(h, p_pred, p_pred + uvoffset, FREC_CSTRIDE,
                       mv_1st.x, mv_1st.y, width, height, p_ref1);
 
             if (num_mvs > 1) {
-                mc_chroma(p_temp, p_temp + uvoffset, FREC_CSTRIDE,
+                mc_chroma10(h, p_temp, p_temp + uvoffset, FREC_CSTRIDE,
                           mv_2nd.x, mv_2nd.y, width, height, p_ref2);
 
                 if (width != 2 && width != 6 && height != 2 && height != 6) {
-                    pixel_avg_pp_t func_avg = g_funcs.pixf.avg[PART_INDEX(width, height)];
+                    pixel_avg_pp10_t func_avg = g_funcs.pixf.avg10[PART_INDEX(width, height)];
                     func_avg(p_pred           , FREC_CSTRIDE, p_pred           , FREC_CSTRIDE, p_temp           , FREC_CSTRIDE, 32);
                     func_avg(p_pred + uvoffset, FREC_CSTRIDE, p_pred + uvoffset, FREC_CSTRIDE, p_temp + uvoffset, FREC_CSTRIDE, 32);
                 } else {
-                    g_funcs.pixf.average(p_pred, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE / 2, p_temp, FREC_CSTRIDE / 2, width, height * 2);
+                    g_funcs.pixf.average10(p_pred, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE / 2, p_temp, FREC_CSTRIDE / 2, width, height * 2);
                 }
             }
         }
+        }
     }
 
     return 1;
@@ -1699,9 +2281,8 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
     dist_t dist_split    = 0;
     dist_t dist_notsplit = 0;
     dist_t best_dist_cur = 0;
-    rdcost_t rdcost = *min_rdcost;   // ³õÊ¼»¯Îª×î´ó¿ÉÔÊÐíµÄRDCost
+    rdcost_t rdcost = *min_rdcost;   // initialized to the maximum allowable RDCost
     rdcost_t rdcost_split = rdcost;
-    pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
     cu_layer_t *p_layer  = cu_get_layer(h, p_cu->cu_info.i_level);
     cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level);
 
@@ -1746,15 +2327,187 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
         }
     }
 
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_fenc = h->lcu.p_fenc8[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
+    /* 3.2, check luma CU tu-split type and CBP */
+    /* 3.2.1, get luma residual */
+    g_funcs.pixf.sub_ps8[PART_INDEX(cu_size, cu_size)](p_enc->coeff_bak, cu_size,
+            p_fenc, p_layer->buf_pred_inter8,
+            FENC_STRIDE, FREC_STRIDE);
+
+    /* 3.2.2, Fast algorithm, check whether TU split is essential */
+    if (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) || IS_ALG_ENABLE(OPT_ECU)) {
+        p_cu->sum_satd = g_funcs.pixf.sad8[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter8, FREC_STRIDE, p_fenc, FENC_STRIDE);
+        p_cu->is_zero_block = isZeroCuFast(h, p_cu);
+    }
+
+    /* only get cost with tu depth equals 1 */
+    if ((h->enable_tu_2level == 1) || ((h->enable_tu_2level == 3) && (p_best->i_tu_split != 0))) {
+        if (b_try_tu_split && b_try_tu_nonsplit && (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block)) {
+            b_try_tu_split = FALSE;
+        }
+
+        if (b_try_tu_split) {
+            h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); /* store coding state for tu depth = 1 */
+
+            dist_split = cu_recon_inter_luma(h, &p_enc->cs_tu, p_cu, 0, 1, cbp_c, dist_chroma);
+            tmp_cbp = tu_rdcost_inter(h, &p_enc->cs_tu, p_cu, dist_split, rate_chroma, &rdcost_split);
+
+            /* store dct coefficients, rec data and coding state for tu depth = 1*/
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
+        } else {
+            rdcost_split = MAX_COST;
+            tmp_cbp = 0;
+        }
+        if (rdcost_split >= *min_rdcost) {
+            h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);
+            return 0;  /* return code = 0, means it is not the best mode */
+        } else {
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
+            p_layer->mode_rdcost[mode] = XAVS2_MIN(rdcost_split, p_layer->mode_rdcost[mode]);
+            /* restore the cbp, dct coefficients, rec data and coding state for tu depth = 1*/
+            p_cu->cu_info.i_cbp = (int8_t)tmp_cbp;
+            *min_rdcost = rdcost_split;
+            p_cu->best_dist_total = dist_split;
+            h->copy_aec_state_rdo(&p_layer->cs_cu, &p_enc->cs_tu);
+            h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);
+            cu_store_parameters(h, p_cu, p_best);
+            return 1;  /* return code = 1, means it is the best mode */
+        }
+    } else if ((h->enable_tu_2level == 0) || ((h->enable_tu_2level == 3) && (p_best->i_tu_split == 0))) {   /* only get cost with tu depth equals 0 */
+        dist_notsplit = cu_recon_inter_luma(h, p_aec, p_cu, 0, 0, cbp_c, dist_chroma);
+        tu_rdcost_inter(h, p_aec, p_cu, dist_notsplit, rate_chroma, &rdcost);
+    } else {
+        if (b_try_tu_split && b_try_tu_nonsplit && (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block)) {
+            b_try_tu_split = FALSE;
+        }
+
+        if (b_try_tu_split) {
+            h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); /* store coding state for tu depth = 1 */
+
+            dist_split = cu_recon_inter_luma(h, &p_enc->cs_tu, p_cu, 0, 1, cbp_c, dist_chroma);
+            tmp_cbp = tu_rdcost_inter(h, &p_enc->cs_tu, p_cu, dist_split, rate_chroma, &rdcost_split);
+
+            /* store dct coefficients, rec data and coding state for tu depth = 1*/
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
+        } else {
+            rdcost_split = MAX_COST;
+            tmp_cbp = 0;
+        }
+
+        /* 3.2.4, get cost with tu depth equals 0 */
+        if (b_try_tu_nonsplit) {
+            dist_notsplit = cu_recon_inter_luma(h, p_aec, p_cu, 0, 0, cbp_c, dist_chroma);
+            tu_rdcost_inter(h, p_aec, p_cu, dist_notsplit, rate_chroma, &rdcost);
+        }
+
+        /* 3.2.5, choose the best tu depth (whether split or not) */
+        if (rdcost > rdcost_split) {
+            /* the best tu depth is 1 */
+            rdcost = rdcost_split;
+            best_dist_cur = dist_split;
+            cu_set_tu_split_type(h, &p_cu->cu_info, 1);
+
+            /* restore the cbp, dct coefficients, rec data and coding state for tu depth = 1*/
+            p_cu->cu_info.i_cbp = (int8_t)tmp_cbp;
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
+
+            h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* restore coding state */
+        } else {
+            best_dist_cur = dist_notsplit;
+        }
+    }
+
+    if (IS_ALG_ENABLE(OPT_CBP_DIRECT) && IS_SKIP_MODE(mode)) {
+        /* The Skip/Direct residual is an all-zero block after transform and quantization:
+         * stopping further CU splitting here saves considerable time at little loss,
+         * but skipping the regular PU partition modes brings no additional speed-up.
+         */
+        p_cu->b_cbp_direct = (p_cu->cu_info.i_cbp == 0);
+    }
+
+    /* 3.3, check skip mode for PRED_SKIP when CBP is nonzero */
+    if (IS_SKIP_MODE(p_cu->cu_info.i_mode) && p_cu->cu_info.i_cbp != 0) {
+        rdcost_t rdcost_skip = MAX_COST;
+        dist_t dist_total_skip;
+        int best_tu_split_type = p_cu->cu_info.i_tu_split;
+
+        if (best_tu_split_type == TU_SPLIT_NON) {
+            h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); /* store coding state for best Direct mode */
+        }
+
+        h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);/* restore coding state */
+
+        tmp_cbp = p_cu->cu_info.i_cbp;
+        /* backup reconstruction buffers, prepare for SKIP mode */
+        XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]);
+        if (cbp_c != 0) {
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[1], p_layer->p_rec8_tmp[1]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[2], p_layer->p_rec8_tmp[2]);
+        }
+
+        /* check SKIP Mode */
+        dist_total_skip = cu_recon_inter_luma(h, p_aec, p_cu, 1, 0, cbp_c, dist_chroma);
+        tu_rdcost_inter(h, p_aec, p_cu, dist_total_skip, rate_chroma, &rdcost_skip);
+
+        if (rdcost_skip <= rdcost) {
+            rdcost = rdcost_skip;    /* skip mode is the best */
+            best_dist_cur = dist_total_skip;
+            p_cu->cu_info.i_tu_split = TU_SPLIT_NON;
+        } else {
+            h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* restore coding state */
+            /* revert buffers */
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]);
+            if (cbp_c != 0) {
+                XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[1], p_layer->p_rec8_tmp[1]);
+                XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[2], p_layer->p_rec8_tmp[2]);
+            }
+
+            p_cu->cu_info.i_cbp = (int8_t)(tmp_cbp);
+            p_cu->cu_info.i_tu_split = (int8_t)(best_tu_split_type);
+        }
+    }
+
+    /* -------------------------------------------------------------
+     * 4, store the min cost for current cu mode
+     */
+    p_layer->mode_rdcost[mode] = XAVS2_MIN(rdcost, p_layer->mode_rdcost[mode]);
+
+    /* -------------------------------------------------------------
+     * 5, update the min cost, restore the coding state and return
+     */
+    if (rdcost >= *min_rdcost) {
+        h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);
+        return 0;  /* return code = 0, means it is not the best mode */
+    } else {
+        if (mode == PRED_SKIP && IS_ALG_ENABLE(OPT_ROUGH_SKIP_SEL)) {
+            /* re-cover best skip prediction data */
+            XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best);
+        }
+        *min_rdcost = rdcost;
+        p_cu->best_dist_total = best_dist_cur;
+        /* store coding state for the best mode */
+        h->copy_aec_state_rdo(&p_layer->cs_cu, p_aec);
+        h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);
+        /* update best CU information */
+        cu_store_parameters(h, p_cu, p_best);
+        return 1;  /* return code = 1, means it is the best mode */
+    }
+    } else {
+    pel10_t *p_fenc = h->lcu.p_fenc10[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
     /* 3.2, check luma CU tu-split type and CBP */
     /* 3.2.1, get luma residual */
-    g_funcs.pixf.sub_ps[PART_INDEX(cu_size, cu_size)](p_enc->coeff_bak, cu_size,
-            p_fenc, p_layer->buf_pred_inter,
+    g_funcs.pixf.sub_ps10[PART_INDEX(cu_size, cu_size)](p_enc->coeff_bak, cu_size,
+            p_fenc, p_layer->buf_pred_inter10,
             FENC_STRIDE, FREC_STRIDE);
 
     /* 3.2.2, Fast algorithm, check whether TU split is essential */
     if (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) || IS_ALG_ENABLE(OPT_ECU)) {
-        p_cu->sum_satd = g_funcs.pixf.sad[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter, FREC_STRIDE, p_fenc, FENC_STRIDE);
+        p_cu->sum_satd = g_funcs.pixf.sad10[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter10, FREC_STRIDE, p_fenc, FENC_STRIDE);
         p_cu->is_zero_block = isZeroCuFast(h, p_cu);
     }
 
@@ -1771,7 +2524,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
             tmp_cbp = tu_rdcost_inter(h, &p_enc->cs_tu, p_cu, dist_split, rate_chroma, &rdcost_split);
 
             /* store dct coefficients, rec data and coding state for tu depth = 1*/
-            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]);
             XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
         } else {
             rdcost_split = MAX_COST;
@@ -1781,7 +2534,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
             h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);
             return 0;  /* return code = 0, means it is not the best mode */
         } else {
-            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]);
             XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
             p_layer->mode_rdcost[mode] = XAVS2_MIN(rdcost_split, p_layer->mode_rdcost[mode]);
             /* restore the cbp, dct coefficients, rec data and coding state for tu depth = 1*/
@@ -1808,7 +2561,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
             tmp_cbp = tu_rdcost_inter(h, &p_enc->cs_tu, p_cu, dist_split, rate_chroma, &rdcost_split);
 
             /* store dct coefficients, rec data and coding state for tu depth = 1*/
-            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]);
             XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
         } else {
             rdcost_split = MAX_COST;
@@ -1830,7 +2583,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
 
             /* restore the cbp, dct coefficients, rec data and coding state for tu depth = 1*/
             p_cu->cu_info.i_cbp = (int8_t)tmp_cbp;
-            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]);
             XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]);
 
             h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* restore coding state */
@@ -1840,9 +2593,9 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
     }
 
     if (IS_ALG_ENABLE(OPT_CBP_DIRECT) && IS_SKIP_MODE(mode)) {
-        /* Skip/DirectÄ£Ê½µÄ²Ð²î¾­¹ý±ä»»Á¿»¯ºóÎªÈ«Áã¿é£º
-         * ´ËÊ±ÖÕÖ¹ÏÂ²ãCU»®·Ö¿ÉÒÔµÃµ½½Ï¶àÊ±¼ä½ÚÊ¡ÇÒËðÊ§½ÏÐ¡£¬
-         * µ«Ìø¹ýÆÕÍ¨PU»®·ÖÄ£Ê½²¢²»ÄÜ´øÀ´¸ü¶àµÄ¼ÓËÙ¡£
+        /* The Skip/Direct residual is an all-zero block after transform and quantization:
+         * stopping further CU splitting here saves considerable time at little loss,
+         * but skipping the regular PU partition modes brings no additional speed-up.
          */
         p_cu->b_cbp_direct = (p_cu->cu_info.i_cbp == 0);
     }
@@ -1861,10 +2614,10 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
 
         tmp_cbp = p_cu->cu_info.i_cbp;
         /* backup reconstruction buffers, prepare for SKIP mode */
-        XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]);
+        XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]);
         if (cbp_c != 0) {
-            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[1], p_layer->p_rec_tmp[1]);
-            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[2], p_layer->p_rec_tmp[2]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[1], p_layer->p_rec10_tmp[1]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[2], p_layer->p_rec10_tmp[2]);
         }
 
         /* check SKIP Mode */
@@ -1878,10 +2631,10 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
         } else {
             h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* restore coding state */
             /* revert buffers */
-            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]);
+            XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]);
             if (cbp_c != 0) {
-                XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[1], p_layer->p_rec_tmp[1]);
-                XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[2], p_layer->p_rec_tmp[2]);
+                XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[1], p_layer->p_rec10_tmp[1]);
+                XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[2], p_layer->p_rec10_tmp[2]);
             }
 
             p_cu->cu_info.i_cbp = (int8_t)(tmp_cbp);
@@ -1903,7 +2656,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
     } else {
         if (mode == PRED_SKIP && IS_ALG_ENABLE(OPT_ROUGH_SKIP_SEL)) {
             /* re-cover best skip prediction data */
-            XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best);
+            XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best);
         }
         *min_rdcost = rdcost;
         p_cu->best_dist_total = best_dist_cur;
@@ -1914,6 +2667,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost,
         cu_store_parameters(h, p_cu, p_best);
         return 1;  /* return code = 1, means it is the best mode */
     }
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -1970,7 +2724,7 @@ rdcost_t cu_rdo_motion_estimation(xavs2_t *h, cu_t *p_cu, xavs2_me_t *p_me, int
         p_cb = &p_cu->cu_info.cb[block];
         cu_get_neighbors(h, p_cu, p_cb);
 
-        /* µÚÒ»¸öPU²»ÐèÒªÖØÐÂ½øÐÐME£¨MVP²»±ä£© */
+        /* the first PU does not need ME to be run again (its MVP is unchanged) */
         if (dualpred_enabled < 0 && block == 0) {
             best_fwd_ref = p_mode->ref_idx_single[0];
         } else {
@@ -2064,14 +2818,14 @@ rdcost_t cu_rdo_motion_estimation(xavs2_t *h, cu_t *p_cu, xavs2_me_t *p_me, int
         p_cu->cu_info.b8pdir[block] = (int8_t)best_pdir;
     }
 
-    cu_get_mvds(h, p_cu);  // Éú³ÉMVD
+    cu_get_mvds(h, p_cu);  // generate the MVDs
 
-    return total_cost;  // ·µ»Ø×îÐ¡Cost
+    return total_cost;  // return the minimum cost
 }
 
 //#if OPT_DMH_CANDIDATE
 /* ---------------------------------------------------------------------------
- * ÌáÇ°»ñÈ¡×îÓÅµÄDMHÄ£Ê½ºòÑ¡£¬¼õÉÙRDO´ÎÊý
+ * Pick the best DMH mode candidate in advance to reduce the number of RDO passes
  */
 static int dmh_bits[9] = {
 //  0, 3, 3, 4, 4, 5, 5, 5, 5
@@ -2082,22 +2836,46 @@ static int rdo_get_dmh_candidate(xavs2_t *h, cu_t *p_cu, rdcost_t rdcost_non_dmh
 {
     const int num_dmh_modes = DMH_MODE_NUM + DMH_MODE_NUM - 1;
     int cu_size = 1 << p_cu->cu_info.i_level;
-    pixel_ssd_t cmp_dmh = g_funcs.pixf.ssd[PART_INDEX(cu_size, cu_size)];
     rdcost_t min_distotion = MAX_COST;
     dist_t distortion;
     rdcost_t cost;
     int best_dmh_cand = -1;
     cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level);
-    pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
     int i;
     int rate;
-    /* ±éÀúDMHÄ£Ê½Ö´ÐÐÔ¤²â²¢¼ÆËãÊ§Õæ£¬È¡Ê§Õæ×îÐ¡µÄÒ»¸öÄ£Ê½×÷ÎªDMHºòÑ¡¼¯ */
+    /* Run prediction for every DMH mode, compute its distortion, and keep the mode with the lowest distortion as the DMH candidate */
+    if (h->param->input_sample_bit_depth == 8) {
+    pixel8_ssd_t cmp_dmh = g_funcs.pixf.ssd8[PART_INDEX(cu_size, cu_size)];
+    pel8_t *p_fenc = h->lcu.p_fenc8[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
+    for (i = 1; i < num_dmh_modes; i++) {
+        /* get prediction data and luma distortion */
+        p_cu->cu_info.dmh_mode = (int8_t)(i);
+        if (rdo_get_pred_inter(h, p_cu, 1)) {
+            rate = dmh_bits[i];
+            distortion = cmp_dmh(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE);
+            cost = distortion + h->f_lambda_mode * rate;
+            if (cost < min_distotion) {
+                min_distotion = cost;
+                best_dmh_cand = i;
+            }
+        }
+    }
+
+    if (IS_ALG_ENABLE(OPT_SKIP_DMH_THRES) && min_distotion > (rdcost_t)(1.2 * rdcost_non_dmh)) {
+        /* the distortion reduction brought by residual coding is not taken into account */
+        return -1;
+    } else {
+        return best_dmh_cand;
+    }
+    } else {
+    pixel10_ssd_t cmp_dmh = g_funcs.pixf.ssd10[PART_INDEX(cu_size, cu_size)];
+    pel10_t *p_fenc = h->lcu.p_fenc10[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
     for (i = 1; i < num_dmh_modes; i++) {
         /* get prediction data and luma distortion */
         p_cu->cu_info.dmh_mode = (int8_t)(i);
         if (rdo_get_pred_inter(h, p_cu, 1)) {
             rate = dmh_bits[i];
-            distortion = cmp_dmh(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE);
+            distortion = cmp_dmh(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE);
             cost = distortion + h->f_lambda_mode * rate;
             if (cost < min_distotion) {
                 min_distotion = cost;
@@ -2107,17 +2885,18 @@ static int rdo_get_dmh_candidate(xavs2_t *h, cu_t *p_cu, rdcost_t rdcost_non_dmh
     }
 
     if (IS_ALG_ENABLE(OPT_SKIP_DMH_THRES) && min_distotion > (rdcost_t)(1.2 * rdcost_non_dmh)) {
-        /* ²»¿¼ÂÇ²Ð²î±àÂë´øÀ´µÄdistortion¼õÉÙ */
+        /* the distortion reduction gained from residual coding is not taken into account */
         return -1;
     } else {
         return best_dmh_cand;
     }
+    }
 }
 //#endif
 
 
 /* ---------------------------------------------------------------------------
- * ³¢ÊÔËùÓÐÖ¡¼äÔ¤²â¿é»®·Ö·½Ê½£¬Ñ¡ÔñÒ»¸ö×îÓÅµÄ»®·Ö
+ * try all inter prediction partition modes and select the best partition
  */
 static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32_t inter_modes,
                                      cu_info_t *best, rdcost_t *p_min_rdcost,
@@ -2133,7 +2912,6 @@ static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32
     int pix_y = p_cu->i_pix_y;
     int pix_x_c = pix_x >> 1;
     int pix_y_c = pix_y >> CHROMA_V_SHIFT;
-    pel_t *p_fenc[3];
     int i;
     int64_t min_cost = MAX_COST;
     int64_t mecost;
@@ -2145,14 +2923,71 @@ static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32
 
     //inter_modes |= (uint32_t)((1 << PRED_2NxN) | (1 << PRED_Nx2N));
 
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_fenc[3];
+    for (mode = 1; mode < MAX_INTER_MODES; mode++) {
+        /* perform motion estimation */
+
+        if (!(inter_modes & (1 << mode))) {
+            continue;           // skip the decision for unavailable modes directly
+        }
+
+        /* fast decision (OPT_BYPASS_AMP): if P2NxN is not the best so far, skip PRED_2NxnU/PRED_2NxnD, which split in the same direction; likewise for PNx2N */
+        if (IS_ALG_ENABLE(OPT_BYPASS_AMP) && i_level > B16X16_IN_BIT) {
+            if ((mode == PRED_2NxnU || mode == PRED_2NxnD) && best_cu_mode != PRED_2NxN) {
+                continue;
+            } else if ((mode == PRED_nLx2N || mode == PRED_nRx2N) && best_cu_mode != PRED_Nx2N) {
+                continue;
+            }
+        }
+
+        p_cu->cu_info.i_mode = (int8_t)mode;
+        cu_init_pu_inter(h, &p_cu->cu_info, i_level, mode);
+        cu_rdo_motion_estimation(h, p_cu, &h->me_state, b_dhp_enabled);
+
+        /* estimate the cost and keep the minimum */
+        p_cu->cu_info.directskip_wsm_idx = 0;
+        p_cu->cu_info.directskip_mhp_idx = DS_NONE;
+        p_cu->cu_info.dmh_mode = 0;
+
+        rdo_get_pred_inter(h, p_cu, 3);
+        p_fenc[0] = h->lcu.p_fenc8[0] + pix_y   * FENC_STRIDE + pix_x;
+        p_fenc[1] = h->lcu.p_fenc8[1] + pix_y_c * FENC_STRIDE + pix_x_c;
+        p_fenc[2] = h->lcu.p_fenc8[2] + pix_y_c * FENC_STRIDE + pix_x_c;
+
+        mecost  = g_funcs.pixf.sa8d8[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter8, FREC_STRIDE, p_fenc[0], FENC_STRIDE);
+        mecost += g_funcs.pixf.sa8d8[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter8_c, FREC_CSTRIDE, p_fenc[1], FENC_STRIDE);
+        mecost += g_funcs.pixf.sa8d8[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter8_c + (FREC_CSTRIDE >> 1), FREC_CSTRIDE, p_fenc[2], FENC_STRIDE);
+
+        for (i = 0; i < p_cu->cu_info.num_pu; i++) {
+            mecost += p_cu->mvcost[i];
+            ref1 = p_cu->cu_info.ref_idx_1st[i];
+            ref2= p_cu->cu_info.ref_idx_2nd[i];
+            if (h->i_type != SLICE_TYPE_B) {
+                mecost += (ref1 == INVALID_REF? 0: REF_COST(ref1));
+                mecost += (ref2 == INVALID_REF? 0: REF_COST(ref2));
+            }
+        }
+
+        if (mecost < min_cost) {
+            memcpy(&p_layer->cu_mode.best_mc_tmp, &p_cu->mc, sizeof(p_cu->mc));
+            memcpy(best, &p_cu->cu_info, sizeof(cu_info_t));
+            min_cost     = mecost;
+            best_cu_mode = mode;
+        }
+    }
+
+    return best_cu_mode;
+    } else {
+    pel10_t *p_fenc[3];
     for (mode = 1; mode < MAX_INTER_MODES; mode++) {
-        /* Ö´ÐÐÔË¶¯¹À¼Æ */
+        /* perform motion estimation */
 
         if (!(inter_modes & (1 << mode))) {
-            continue;           // Ö±½ÓÌø¹ý²»¿ÉÓÃÄ£Ê½µÄ¾ö²ß
+            continue;           // skip the decision for unavailable modes directly
         }
 
-        /* ¿ìËÙ¾ö²ß(OPT_BYPASS_AMP)£ºÈç¹ûP2NxNÎ´»ñµÃ×îÓÅ£¬Ö±½ÓÌø¹ýÏàÍ¬»®·Ö·½ÏòµÄPRED_2NxnU/PRED_2NxnD; PNx2NÍ¬Àí */
+        /* fast decision (OPT_BYPASS_AMP): if P2NxN is not the best so far, skip PRED_2NxnU/PRED_2NxnD, which split in the same direction; likewise for PNx2N */
         if (IS_ALG_ENABLE(OPT_BYPASS_AMP) && i_level > B16X16_IN_BIT) {
             if ((mode == PRED_2NxnU || mode == PRED_2NxnD) && best_cu_mode != PRED_2NxN) {
                 continue;
@@ -2165,19 +3000,19 @@ static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32
         cu_init_pu_inter(h, &p_cu->cu_info, i_level, mode);
         cu_rdo_motion_estimation(h, p_cu, &h->me_state, b_dhp_enabled);
 
-        /* ¹À¼ÆCostÑ¡È¡×îÐ¡µÄ */
+        /* estimate the cost and keep the minimum */
         p_cu->cu_info.directskip_wsm_idx = 0;
         p_cu->cu_info.directskip_mhp_idx = DS_NONE;
         p_cu->cu_info.dmh_mode = 0;
 
         rdo_get_pred_inter(h, p_cu, 3);
-        p_fenc[0] = h->lcu.p_fenc[0] + pix_y   * FENC_STRIDE + pix_x;
-        p_fenc[1] = h->lcu.p_fenc[1] + pix_y_c * FENC_STRIDE + pix_x_c;
-        p_fenc[2] = h->lcu.p_fenc[2] + pix_y_c * FENC_STRIDE + pix_x_c;
+        p_fenc[0] = h->lcu.p_fenc10[0] + pix_y   * FENC_STRIDE + pix_x;
+        p_fenc[1] = h->lcu.p_fenc10[1] + pix_y_c * FENC_STRIDE + pix_x_c;
+        p_fenc[2] = h->lcu.p_fenc10[2] + pix_y_c * FENC_STRIDE + pix_x_c;
 
-        mecost  = g_funcs.pixf.sa8d[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter, FREC_STRIDE, p_fenc[0], FENC_STRIDE);
-        mecost += g_funcs.pixf.sa8d[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter_c, FREC_CSTRIDE, p_fenc[1], FENC_STRIDE);
-        mecost += g_funcs.pixf.sa8d[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter_c + (FREC_CSTRIDE >> 1), FREC_CSTRIDE, p_fenc[2], FENC_STRIDE);
+        mecost  = g_funcs.pixf.sa8d10[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter10, FREC_STRIDE, p_fenc[0], FENC_STRIDE);
+        mecost += g_funcs.pixf.sa8d10[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter10_c, FREC_CSTRIDE, p_fenc[1], FENC_STRIDE);
+        mecost += g_funcs.pixf.sa8d10[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter10_c + (FREC_CSTRIDE >> 1), FREC_CSTRIDE, p_fenc[2], FENC_STRIDE);
 
         for (i = 0; i < p_cu->cu_info.num_pu; i++) {
             mecost += p_cu->mvcost[i];
@@ -2198,10 +3033,11 @@ static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32
     }
 
     return best_cu_mode;
+    }
 }
 
 /* ---------------------------------------------------------------------------
- * ³¢ÊÔÆÕÍ¨Ö¡¼äÔ¤²â¿é»®·Ö·½Ê½£¬²¢¼ÆËãÏàÓ¦µÄCost
+ * try a normal inter prediction partition mode and compute its cost
  */
 static
 void cu_check_inter_partition(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int mode, int i_level,
@@ -2216,13 +3052,13 @@ void cu_check_inter_partition(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int mode, in
 
     h->lcu.bypass_all_dmh = 0;
 
-    /* ¼ÆËãÒ»¸öÖ¡¼ä»®·ÖÄ£Ê½µÄRDCost£¬ÒÔÈ·¶¨×îÓÅ±àÂëÄ£Ê½ */
+    /* compute the RD cost of one inter partition mode in order to determine the best coding mode */
     p_cu->cu_info.directskip_wsm_idx = 0;
     p_cu->cu_info.directskip_mhp_idx = DS_NONE;
     p_cu->cu_info.dmh_mode = 0;
     cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, best);
 
-    /* ¼ì²éDMHÄ£Ê½ */
+    /* check the DMH modes */
     if (h->i_type == SLICE_TYPE_F && h->param->enable_dmh && !h->lcu.bypass_all_dmh && b_check_dmh
         && !(i_level == B8X8_IN_BIT && mode != PRED_2Nx2N)) {  // disable 8x4 or 4x8 2MVs/PU mode
         int dmh_mode_candidate = 0;
@@ -2232,26 +3068,26 @@ void cu_check_inter_partition(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int mode, in
 
         if (p_cu->cu_info.b8pdir[0] == PDIR_FWD && p_cu->cu_info.b8pdir[1] == PDIR_FWD &&
             p_cu->cu_info.b8pdir[2] == PDIR_FWD && p_cu->cu_info.b8pdir[3] == PDIR_FWD) {
-            /* MEÈ·¶¨µÄ×îÓÅµÄPUÔ¤²â·½Ïò¾ùÎªµ¥Ç°Ïò£¬´ËÊ±Ö»ÐèÒª¼ì²éºóÐøDMHÄ£Ê½ */
+            /* every PU direction chosen by ME is single-forward, so only the subsequent DMH modes need to be checked */
             dmh_mode = 1;
-        } else { // DHP ¿ªÆôÇÒ²Î¿¼Ö¡ÊýÁ¿Îª2Ê±²ÅÓÐ¿ÉÄÜÉÏÊöÌõ¼þ²»³ÉÁ¢
-            /* ×îÓÅµÄPUÖÐ°üº¬Ë«Ç°Ïò¿é£¬´ËÊ±ÐèÒª¼ÆËãPU¾ùÎªµ¥Ç°ÏòÊ±µÄRDCosts£¬ÔÙ±éÀúºóÐøDMHÄ£Ê½ */
-            /* ´ËÊ±ÐèÖØÐÂME£¬Í¬Ê±µÚÒ»¸öPU²»ÐèÒªÖØÐÂËÑË÷ */
+        } else { // the condition above can only fail when DHP is enabled and there are 2 reference frames
+            /* the best PUs include dual-forward blocks: compute the RD costs with all PUs single-forward first, then iterate over the subsequent DMH modes */
+            /* ME has to be redone here, but the first PU does not need to be searched again */
             cu_rdo_motion_estimation(h, p_cu, &h->me_state, -1);
             dmh_mode = 0;
         }
 
-        /* ×Ü¼Æ 2 * (DMH_MODE_NUM - 1) + 1 ¸öÄ£Ê½ */
+        /* 2 * (DMH_MODE_NUM - 1) + 1 modes in total */
         max_dmh_mode = DMH_MODE_NUM + DMH_MODE_NUM - 1;
 
-        /* ¿ìËÙËã·¨£¬´ÓDMH¿ÉÑ¡Ä£Ê½ÖÐ¹À¼Æ×îÐèÒª×öµÄÄ£Ê½
-            * ±ÜÃâÒÀ´Î±éÀúËùÓÐÄ£Ê½¾Þ´óµÄ¼ÆËãÁ¿
+        /* fast algorithm: estimate the most worthwhile mode among the available DMH modes
+            * to avoid the heavy computation of iterating over every mode in turn
             */
         if (IS_ALG_ENABLE(OPT_DMH_CANDIDATE)) {
             dmh_mode_candidate = rdo_get_dmh_candidate(h, p_cu, *p_min_rdcost);
         }
 
-        // µ±Ä³¸öÄ£Ê½ÏÂµÄ²Ð²îÎªÈ«ÁãÊ±£¬Ìø¹ýËùÓÐºóÐødmhÄ£Ê½
+        // once the residual of some mode is all zero, skip all subsequent DMH modes
         for (; dmh_mode < max_dmh_mode && !h->lcu.bypass_all_dmh; dmh_mode++) {
             if (IS_ALG_ENABLE(OPT_DMH_CANDIDATE)) {
                 if (dmh_mode != 0 && dmh_mode != dmh_mode_candidate) {
@@ -2259,7 +3095,7 @@ void cu_check_inter_partition(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int mode, in
                 }
             } else {
                 if (dmh_mode > (DMH_MODE_NUM - 1)) {
-                    if (best_dmh_mode != (dmh_mode - (DMH_MODE_NUM - 1))) { // Ö»ÔÚÍ¬·½ÏòÉÏÀ©Õ¹£¬ÆäËûÌø¹ý
+                    if (best_dmh_mode != (dmh_mode - (DMH_MODE_NUM - 1))) { // only extend along the same direction, skip the others
                         continue;
                     }
                 }
@@ -2340,7 +3176,7 @@ typedef struct cu_skip_mc_t {
 } cu_skip_mc_t;
 
 /* ---------------------------------------------------------------------------
- * ¸üÐÂSkipµÄMV¼¯£¬ÒÔ¼ì²âµ±Ç°Ä£Ê½µÄMVÊÇ·ñ±»±éÀú¹ý
+ * update the skip MV set, used to detect whether the current mode's MV has already been visited
  */
 static ALWAYS_INLINE
 int is_same_skip_mc_param(const cu_skip_mc_t *p_src1, const cu_skip_mc_t *p_src2)
@@ -2360,7 +3196,7 @@ int is_same_skip_mc_param(const cu_skip_mc_t *p_src1, const cu_skip_mc_t *p_src2
 }
 
 /* ---------------------------------------------------------------------------
- * ¸üÐÂSkipµÄMV¼¯£¬ÒÔ¼ì²âµ±Ç°Ä£Ê½µÄMVÊÇ·ñ±»±éÀú¹ý
+ * update the skip MV set, used to detect whether the current mode's MV has already been visited
  */
 static
 int update_skip_mv_list(cu_skip_mc_t *p_skip_mvs, int i_num, cu_t *p_cu)
@@ -2389,7 +3225,7 @@ int update_skip_mv_list(cu_skip_mc_t *p_skip_mvs, int i_num, cu_t *p_cu)
 }
 
 /* ---------------------------------------------------------------------------
- * ¼ì²éSkip/DirectÄ£Ê½µÄ±àÂë´ú¼Û£¨ÒÀ¾ÝÔ¤²â²Ð²î£©£¬Ñ¡È¡×îÓÅµÄSkip×ÓÄ£Ê½½øÐÐÒ»´ÎRDO
+ * check the coding cost of the Skip/Direct modes (based on the prediction residual), then run RDO once on the best skip sub-mode
  */
 static
 void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu_t *p_cu, rdcost_t *p_min_rdcost)
@@ -2398,9 +3234,7 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
     int num_mc_params = 0;
     int max_skip_mode_num, i;
     int cu_size = p_cu->i_size;
-    pixel_ssd_t cmp_skip = g_funcs.pixf.sa8d[PART_INDEX(cu_size, cu_size)];
     cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level);
-    pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
     dist_t distortion;
     rdcost_t rdcost;
     rdcost_t min_rdcost = MAX_COST;
@@ -2423,10 +3257,158 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
     p_cu->cu_info.directskip_mhp_idx = DS_NONE;
     p_cu->cu_info.directskip_wsm_idx = 0;
 
-    /* Ê±ÓòMVPÔ¤²âµÄÖ±½ÓËãRDCost£¬ÔÙ¸ú¿ÕÓòµÄ×îÓÅµÄRDCost×ö±È½Ï£¬ÔöÒæ 3%×óÓÒ£¬Ê±¼äÔö¼Ó 20%~30% */
+    /* compute the RD cost of the temporal MVP prediction directly, then compare it with the best spatial RD cost; gain is about 3%, encoding time grows by 20%~30% */
     cu_set_mvs_skip(h, p_cu);
     cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
 
+    if (h->param->input_sample_bit_depth == 8) {
+    pixel8_ssd_t cmp_skip = g_funcs.pixf.sa8d8[PART_INDEX(cu_size, cu_size)];
+    pel8_t *p_fenc = h->lcu.p_fenc8[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
+    /* 2, Weighted skip mode, derive MV from temporal and scaling */
+    for (i = 1; i < max_skip_mode_num; i++) {
+        int need_check_mv;
+        p_cu->cu_info.directskip_wsm_idx = (int8_t)i;
+        cu_set_mvs_skip(h, p_cu);
+        cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
+        need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu);
+        num_mc_params += need_check_mv;
+        if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
+            rate = p_aec->binary.est_cu_header(h, p_aec, p_cu);
+            distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE);
+            rdcost = distortion + h->f_lambda_mode * rate;
+            if (rdcost < min_rdcost) {
+                XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best);
+                min_rdcost = rdcost;
+                best_weighted_skip = i;
+            }
+        }
+    }
+
+    /* 3, the four spatial direct types (single first, single second, dual first, dual second) */
+    if ((h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) && (!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B)) {
+        p_cu->cu_info.directskip_wsm_idx = 0;
+        for (i = 0; i < DS_MAX_NUM; i++) {
+            int need_check_mv;
+            p_cu->cu_info.directskip_mhp_idx = (int8_t)i;
+            cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
+            cu_set_mvs_skip(h, p_cu);
+            need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu);
+            num_mc_params += need_check_mv;
+            if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
+                rate = headerbits_skipmode[4+i];
+                distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE);
+                rdcost = distortion + h->f_lambda_mode * rate;
+                if (rdcost < min_rdcost) {
+                    XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best);
+                    min_rdcost = rdcost;
+                    best_weighted_skip = 0;
+                    best_skip_mode = i;
+                }
+            }
+        }
+        /* pick the best one among the modes with the smallest distortion */
+        p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode;
+        p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip;
+        cu_set_mvs_skip(h, p_cu);
+        cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
+    } else if ((h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) && (h->fdec->rps.poc == 2 || h->fdec->rps.poc == 6)) {
+        if (p_cu->p_left_cu != NULL && p_cu->p_topA_cu != NULL && p_cu->p_topL_cu != NULL && p_cu->p_topR_cu != NULL) {
+            if ((p_cu->p_left_cu->i_mode == 0 && p_cu->p_topA_cu->i_mode == 0 && p_cu->p_topL_cu->i_mode == 0 && p_cu->p_topR_cu->i_mode == 0) && (p_cu->p_left_cu->i_cbp == 0 || p_cu->p_topA_cu->i_cbp == 0 || p_cu->p_topL_cu->i_cbp == 0 || p_cu->p_topR_cu->i_cbp == 0)) {
+                p_cu->cu_info.directskip_wsm_idx = 0;
+                for (i = 0; i < DS_MAX_NUM; i++) {
+                    int need_check_mv;
+                    p_cu->cu_info.directskip_mhp_idx = (int8_t)i;
+                    cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
+                    cu_set_mvs_skip(h, p_cu);
+                    need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu);
+                    num_mc_params += need_check_mv;
+                    if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
+                        rate = headerbits_skipmode[4 + i];
+                        distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE);
+                        rdcost = distortion + h->f_lambda_mode * rate;
+                        if (rdcost < min_rdcost) {
+                            XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best);
+                            min_rdcost = rdcost;
+                            best_weighted_skip = 0;
+                            best_skip_mode = i;
+                        }
+                    }
+                }
+                /* pick the best one among the modes with the smallest distortion */
+                p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode;
+                p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip;
+                cu_set_mvs_skip(h, p_cu);
+                cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
+
+            } else {
+                p_cu->cu_info.directskip_wsm_idx = 0;
+                for (i = 0; i < DS_MAX_NUM; i++) {
+                    p_cu->cu_info.directskip_mhp_idx = (int8_t)i;
+                    cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
+                    cu_set_mvs_skip(h, p_cu);
+                    cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
+                }
+            }
+        } else {
+            p_cu->cu_info.directskip_wsm_idx = 0;
+            for (i = 0; i < DS_MAX_NUM; i++) {
+                p_cu->cu_info.directskip_mhp_idx = (int8_t)i;
+                cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
+                cu_set_mvs_skip(h, p_cu);
+                cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
+            }
+        }
+    } else if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) {
+        if (p_cu->p_left_cu != NULL && p_cu->p_topA_cu != NULL && p_cu->p_topL_cu != NULL && p_cu->p_topR_cu != NULL) {
+            if ((p_cu->p_left_cu->i_mode == 0 && p_cu->p_topA_cu->i_mode == 0 && p_cu->p_topL_cu->i_mode == 0 && p_cu->p_topR_cu->i_mode == 0) && (p_cu->p_left_cu->i_cbp == 0 && p_cu->p_topA_cu->i_cbp == 0 && p_cu->p_topL_cu->i_cbp == 0 && p_cu->p_topR_cu->i_cbp == 0)) {
+                p_cu->cu_info.directskip_wsm_idx = 0;
+                for (i = 0; i < DS_MAX_NUM; i++) {
+                    int need_check_mv;
+                    p_cu->cu_info.directskip_mhp_idx = (int8_t)i;
+                    cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
+                    cu_set_mvs_skip(h, p_cu);
+                    need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu);
+                    num_mc_params += need_check_mv;
+                    if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
+                        rate = headerbits_skipmode[4 + i];
+                        distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE);
+                        rdcost = distortion + h->f_lambda_mode * rate;
+                        if (rdcost < min_rdcost) {
+                            XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best);
+                            min_rdcost = rdcost;
+                            best_weighted_skip = 0;
+                            best_skip_mode = i;
+                        }
+                    }
+                }
+                /* pick the best one among the modes with the smallest distortion */
+                p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode;
+                p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip;
+                cu_set_mvs_skip(h, p_cu);
+                cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
+
+            } else {
+                p_cu->cu_info.directskip_wsm_idx = 0;
+                for (i = 0; i < DS_MAX_NUM; i++) {
+                    p_cu->cu_info.directskip_mhp_idx = (int8_t)i;
+                    cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
+                    cu_set_mvs_skip(h, p_cu);
+                    cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
+                }
+            }
+        } else {
+            p_cu->cu_info.directskip_wsm_idx = 0;
+            for (i = 0; i < DS_MAX_NUM; i++) {
+                p_cu->cu_info.directskip_mhp_idx = (int8_t)i;
+                cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
+                cu_set_mvs_skip(h, p_cu);
+                cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
+            }
+        }
+    }
+    } else {
+    pixel10_ssd_t cmp_skip = g_funcs.pixf.sa8d10[PART_INDEX(cu_size, cu_size)];
+    pel10_t *p_fenc = h->lcu.p_fenc10[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
     /* 2, Weighted skip mode, derive MV from temporal and scaling */
     for (i = 1; i < max_skip_mode_num; i++) {
         int need_check_mv;
@@ -2437,17 +3419,17 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
         num_mc_params += need_check_mv;
         if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
             rate = p_aec->binary.est_cu_header(h, p_aec, p_cu);
-            distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE);
+            distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE);
             rdcost = distortion + h->f_lambda_mode * rate;
             if (rdcost < min_rdcost) {
-                XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best);
+                XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best);
                 min_rdcost = rdcost;
                 best_weighted_skip = i;
             }
         }
     }
 
-    /* 3, ËÄ¸öspatial directÀàÐÍ (single first, single second, dual first, dual second) */
+    /* 3, the four spatial direct types (single first, single second, dual first, dual second) */
     if ((h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) && (!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B)) {
         p_cu->cu_info.directskip_wsm_idx = 0;
         for (i = 0; i < DS_MAX_NUM; i++) {
@@ -2459,17 +3441,17 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
             num_mc_params += need_check_mv;
             if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
                 rate = headerbits_skipmode[4+i];
-                distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE);
+                distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE);
                 rdcost = distortion + h->f_lambda_mode * rate;
                 if (rdcost < min_rdcost) {
-                    XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best);
+                    XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best);
                     min_rdcost = rdcost;
                     best_weighted_skip = 0;
                     best_skip_mode = i;
                 }
             }
         }
-        /* ÔÚdistortion×îÐ¡µÄÄ£Ê½ÖÐÑ¡ÔñÒ»¸ö×îÓÅµÄ */
+        /* pick the best one among the modes with the smallest distortion */
         p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode;
         p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip;
         cu_set_mvs_skip(h, p_cu);
@@ -2487,17 +3469,17 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
                     num_mc_params += need_check_mv;
                     if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
                         rate = headerbits_skipmode[4 + i];
-                        distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE);
+                        distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE);
                         rdcost = distortion + h->f_lambda_mode * rate;
                         if (rdcost < min_rdcost) {
-                            XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best);
+                            XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best);
                             min_rdcost = rdcost;
                             best_weighted_skip = 0;
                             best_skip_mode = i;
                         }
                     }
                 }
-                /* ÔÚdistortion×îÐ¡µÄÄ£Ê½ÖÐÑ¡ÔñÒ»¸ö×îÓÅµÄ */
+                /* pick the best one among the modes with the smallest distortion */
                 p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode;
                 p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip;
                 cu_set_mvs_skip(h, p_cu);
@@ -2534,17 +3516,17 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
                     num_mc_params += need_check_mv;
                     if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
                         rate = headerbits_skipmode[4 + i];
-                        distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE);
+                        distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE);
                         rdcost = distortion + h->f_lambda_mode * rate;
                         if (rdcost < min_rdcost) {
-                            XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best);
+                            XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best);
                             min_rdcost = rdcost;
                             best_weighted_skip = 0;
                             best_skip_mode = i;
                         }
                     }
                 }
-                /* ÔÚdistortion×îÐ¡µÄÄ£Ê½ÖÐÑ¡ÔñÒ»¸ö×îÓÅµÄ */
+                /* pick the best one among the modes with the smallest distortion */
                 p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode;
                 p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip;
                 cu_set_mvs_skip(h, p_cu);
@@ -2569,6 +3551,7 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
             }
         }
     }
+    }
 }
 
 static
@@ -2578,9 +3561,7 @@ void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
     int num_mc_params = 0;
     int max_skip_mode_num, i;
     int cu_size = p_cu->i_size;
-    pixel_ssd_t cmp_skip = g_funcs.pixf.sa8d[PART_INDEX(cu_size, cu_size)];
     cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level);
-    pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
     dist_t distortion;
     rdcost_t rdcost;
     rdcost_t min_rdcost = MAX_COST;
@@ -2603,15 +3584,80 @@ void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
     p_cu->cu_info.directskip_mhp_idx = DS_NONE;
     p_cu->cu_info.directskip_wsm_idx = 0;
 
-    /* Ê±ÓòMVPÔ¤²âµÄÖ±½ÓËãRDCost£¬ÔÙ¸ú¿ÕÓòµÄ×îÓÅµÄRDCost×ö±È½Ï£¬ÔöÒæ 3%×óÓÒ£¬Ê±¼äÔö¼Ó 20%~30% */
+    /* compute the RD cost of the temporal MVP prediction directly, then compare it with the best spatial RD cost; gain is about 3%, encoding time grows by 20%~30% */
     cu_set_mvs_skip(h, p_cu);
     cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
     num_mc_params += update_skip_mv_list(skip_mc_params, num_mc_params, p_cu);
+
+    if (h->param->input_sample_bit_depth == 8) {
+    pixel8_ssd_t cmp_skip = g_funcs.pixf.sa8d8[PART_INDEX(cu_size, cu_size)];
+    pel8_t *p_fenc = h->lcu.p_fenc8[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
+    if (rdo_get_pred_inter(h, p_cu, 1)) {
+        rate = headerbits_skipmode[0];//p_aec->binary.est_cu_header(h, p_aec, p_cu);
+        distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE);
+        min_rdcost = distortion + h->f_lambda_mode * rate;
+        XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best);
+        best_weighted_skip = 0;
+        best_skip_mode = DS_NONE;
+    }
+
+    /* 2, Weighted skip mode, derive MV from temporal and scaling */
+    for (i = 1; i < max_skip_mode_num; i++) {
+        int need_check_mv;
+        p_cu->cu_info.directskip_wsm_idx = (int8_t)i;
+        cu_set_mvs_skip(h, p_cu);
+        cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
+        need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu);
+        num_mc_params += need_check_mv;
+        if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
+            rate = headerbits_skipmode[i];//p_aec->binary.est_cu_header(h, p_aec, p_cu);
+            distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE);
+            rdcost = distortion + h->f_lambda_mode * rate;
+            if (rdcost < min_rdcost) {
+                XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best);
+                min_rdcost = rdcost;
+                best_weighted_skip = i;
+            }
+        }
+    }
+
+    /* 3, the four spatial direct types (single first, single second, dual first, dual second) */
+    if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) {
+        p_cu->cu_info.directskip_wsm_idx = 0;
+        for (i = 0; i < DS_MAX_NUM; i++) {
+            int need_check_mv;
+            p_cu->cu_info.directskip_mhp_idx = (int8_t)i;
+            cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP);
+            cu_set_mvs_skip(h, p_cu);
+            need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu);
+            num_mc_params += need_check_mv;
+            if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
+                rate = headerbits_skipmode[4 + i];//p_aec->binary.est_cu_header(h, p_aec, p_cu);
+                distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE);
+                rdcost = distortion + h->f_lambda_mode * rate;
+                if (rdcost < min_rdcost) {
+                    XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best);
+                    min_rdcost = rdcost;
+                    best_weighted_skip = 0;
+                    best_skip_mode = i;
+                }
+            }
+        }
+    }
+
+    /* pick the best one among the modes with the smallest distortion */
+    p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode;
+    p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip;
+    cu_set_mvs_skip(h, p_cu);
+    cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
+    } else {
+    pixel10_ssd_t cmp_skip = g_funcs.pixf.sa8d10[PART_INDEX(cu_size, cu_size)];
+    pel10_t *p_fenc = h->lcu.p_fenc10[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
     if (rdo_get_pred_inter(h, p_cu, 1)) {
         rate = headerbits_skipmode[0];//p_aec->binary.est_cu_header(h, p_aec, p_cu);
-        distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE);
+        distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE);
         min_rdcost = distortion + h->f_lambda_mode * rate;
-        XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best);
+        XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best);
         best_weighted_skip = 0;
         best_skip_mode = DS_NONE;
     }
@@ -2626,17 +3672,17 @@ void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
         num_mc_params += need_check_mv;
         if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
             rate = headerbits_skipmode[i];//p_aec->binary.est_cu_header(h, p_aec, p_cu);
-            distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE);
+            distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE);
             rdcost = distortion + h->f_lambda_mode * rate;
             if (rdcost < min_rdcost) {
-                XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best);
+                XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best);
                 min_rdcost = rdcost;
                 best_weighted_skip = i;
             }
         }
     }
 
-    /* 3, ËÄ¸öspatial directÀàÐÍ (single first, single second, dual first, dual second) */
+    /* 3, four spatial direct types (single first, single second, dual first, dual second) */
     if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) {
         p_cu->cu_info.directskip_wsm_idx = 0;
         for (i = 0; i < DS_MAX_NUM; i++) {
@@ -2648,10 +3694,10 @@ void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
             num_mc_params += need_check_mv;
             if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) {
                 rate = headerbits_skipmode[4 + i];//p_aec->binary.est_cu_header(h, p_aec, p_cu);
-                distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE);
+                distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE);
                 rdcost = distortion + h->f_lambda_mode * rate;
                 if (rdcost < min_rdcost) {
-                    XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best);
+                    XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best);
                     min_rdcost = rdcost;
                     best_weighted_skip = 0;
                     best_skip_mode = i;
@@ -2660,16 +3706,17 @@ void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu
         }
     }
 
-    /* ÔÚdistortion×îÐ¡µÄÄ£Ê½ÖÐÑ¡ÔñÒ»¸ö×îÓÅµÄ */
+    /* choose the best one among the modes with the smallest distortion */
     p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode;
     p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip;
     cu_set_mvs_skip(h, p_cu);
     cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
+    }
 }
 
 
 /* ---------------------------------------------------------------------------
- * ¼ì²éSkip/DirectÄ£Ê½µÄ±àÂë´ú¼Û£¨RDO£©£¬Ñ¡È¡×îÓÅµÄSkip×ÓÄ£Ê½
+ * check the coding cost (RDO) of the Skip/Direct modes and select the best Skip sub-mode
  */
 static
 void cu_check_skip_direct_fullrdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu_t *p_cu, rdcost_t *p_min_rdcost)
@@ -2690,7 +3737,7 @@ void cu_check_skip_direct_fullrdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, c
     p_cu->cu_info.directskip_mhp_idx = DS_NONE;
     p_cu->cu_info.directskip_wsm_idx = 0;
 
-    /* Ê±ÓòMVPÔ¤²âµÄÖ±½ÓËãRDCost£¬ÔÙ¸ú¿ÕÓòµÄ×îÓÅµÄRDCost×ö±È½Ï£¬ÔöÒæ 3%×óÓÒ£¬Ê±¼äÔö¼Ó 20%~30% */
+    /* compute the RDCost of the temporal MVP prediction directly, then compare it with the best spatial RDCost; gain is about 3%, time increases by 20%~30% */
     cu_set_mvs_skip(h, p_cu);
     cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
 
@@ -2701,7 +3748,7 @@ void cu_check_skip_direct_fullrdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, c
         cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best);
     }
 
-    /* 3, ËÄ¸öspatial directÀàÐÍ (single first, single second, dual first, dual second) */
+    /* 3, four spatial direct types (single first, single second, dual first, dual second) */
     if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) {
         p_cu->cu_info.directskip_wsm_idx = 0;
         for (i = 0; i < DS_MAX_NUM; i++) {
@@ -2984,7 +4031,11 @@ rdcost_t compress_cu_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
 
     h->lcu.b_enable_rdoq     = (h->param->i_rdoq_level == RDOQ_ALL);
     h->lcu.b_2nd_rdcost_pass = 1;
-    h->lcu.get_intra_dir_for_rdo_luma = h->get_intra_candidates_luma;
+    if (h->param->input_sample_bit_depth == 8) {
+    h->lcu.get_intra_dir_for_rdo_luma8 = h->get_intra_candidates_luma8;
+    } else {
+    h->lcu.get_intra_dir_for_rdo_luma10 = h->get_intra_candidates_luma10;
+    }
 
     //===== SET VALID MODES =====
     intra_modes = cu_get_valid_modes(h, h->i_type, i_level);
@@ -2997,11 +4048,11 @@ rdcost_t compress_cu_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
     //===== GET BEST MACROBLOCK MODE =====
     for (mode = PRED_I_2Nx2N; mode <= PRED_I_nx2N; mode++) {
         if (!(intra_modes & (1 << mode))) {
-            continue;           // Ö±½ÓÌø¹ý²»¿ÉÓÃÄ£Ê½
+            continue;           // skip unavailable modes directly
         }
 
         if (IS_ALG_ENABLE(OPT_BYPASS_SDIP)) {
-            // ×îºóÒ»¸ö·Ç¶Ô³ÆÖ¡ÄÚÄ£Ê½µÄÌáÇ°Ìø¹ý
+            // early bypass of the last asymmetric intra mode
             if (sdip_early_bypass(h, p_layer, mode)) {
                 continue;
             }
@@ -3013,9 +4064,13 @@ rdcost_t compress_cu_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         cu_check_intra(h, p_aec, p_cu, best, mode, &min_rdcost);
     }
 
-    /* ¼ì²é×îÓÅÄ£Ê½£¬´øRDOQ */
+    /* check the best mode, with RDOQ */
     if (h->param->i_rdoq_level == RDOQ_CU_LEVEL && best->i_cbp > 0) {
-        h->lcu.get_intra_dir_for_rdo_luma = rdo_get_pred_intra_luma_2nd_pass;
+        if (h->param->input_sample_bit_depth == 8) {
+        h->lcu.get_intra_dir_for_rdo_luma8 = rdo_get_pred_intra_luma8_2nd_pass;
+        } else {
+        h->lcu.get_intra_dir_for_rdo_luma10 = rdo_get_pred_intra_luma10_2nd_pass;
+        }
         h->lcu.b_enable_rdoq = 1;
         mode = best->i_mode;
         cu_copy_info(&p_cu->cu_info, best);
@@ -3043,10 +4098,14 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
     cu_layer_t *p_layer  = cu_get_layer(h, p_cu->cu_info.i_level);
 
     /* -------------------------------------------------------------
-     * 1, ³õÊ¼»¯
+     * 1, initialization
      */
     UNUSED_PARAMETER(cost_limit);
-    h->lcu.get_intra_dir_for_rdo_luma = h->get_intra_candidates_luma;
+    if (h->param->input_sample_bit_depth == 8) {
+    h->lcu.get_intra_dir_for_rdo_luma8 = h->get_intra_candidates_luma8;
+    } else {
+    h->lcu.get_intra_dir_for_rdo_luma10 = h->get_intra_candidates_luma10;
+    }
     h->enable_tu_2level = IS_ALG_ENABLE(OPT_TU_LEVEL_DEC) ? 0 : 2;
     h->lcu.b_enable_rdoq      = (h->param->i_rdoq_level == RDOQ_ALL);
     h->lcu.b_2nd_rdcost_pass  = 0;
@@ -3056,12 +4115,12 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
     }
 
     /* reset chroma intra predictor to default */
-    p_cu->cu_info.i_intra_mode_c = DC_PRED_C;   // @luofl£ºÇëÎðÒÆ³ý´ËÐÐ£¬·ñÔò»áµ¼ÖÂ²»Æ¥ÅäÎÊÌâ£»20170304 19:52:32
+    p_cu->cu_info.i_intra_mode_c = DC_PRED_C;   // @luofl: do not remove this line, otherwise a mismatch will occur; 20170304 19:52:32
 
     /* -------------------------------------------------------------
-     * 2, ¼ì²éSkipºÍDirectÄ£Ê½
+     * 2, check Skip and Direct modes
      */
-    /* ¼ì²éËùÓÐSKIP/Direct×ÓÄ£Ê½ */
+    /* check all SKIP/Direct sub-modes */
     p_cu->cu_info.i_mode = PRED_SKIP;
 
     if (IS_ALG_ENABLE(OPT_ROUGH_SKIP_SEL) && h->skip_rough_improved) {
@@ -3087,15 +4146,15 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
 
 
     /* -------------------------------------------------------------
-     * 3, ·ÇSkip/DirectµÄÖ¡¼äÄ£Ê½
+     * 3, inter modes other than Skip/Direct
      */
     for (mode = 1; mode < MAX_INTER_MODES; mode++) {
         if (!(avail_modes & (1 << mode))) {
-            continue;           // Ö±½ÓÌø¹ý²»¿ÉÓÃÄ£Ê½µÄ¾ö²ß
+            continue;           // skip the decision for unavailable modes directly
         }
 
         /* -------------------------------------------------------------
-         * 3.1 ÓëSkip/DirectÄ£Ê½Ïà¹ØµÄ¿ìËÙÄ£Ê½¾ö²ßËã·¨·ÅÔÚ´Ë´¦
+         * 3.1 fast mode decision algorithms related to the Skip/Direct modes go here
          */
 
 #if SAVE_CU_INFO
@@ -3107,8 +4166,8 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         }
 #endif
 
-        /* ¿ìËÙPU»®·ÖÄ£Ê½¾ö²ß£º
-         * Èç¹ûP2NxNÎ´»ñµÃ×îÓÅ£¬Ö±½ÓÌø¹ýÏàÍ¬»®·Ö·½ÏòµÄPRED_2NxnU/PRED_2NxnD; PNx2NÍ¬Àí */
+        /* fast PU partition mode decision:
+         * if P2NxN is not the best, directly skip PRED_2NxnU/PRED_2NxnD which share its partition direction; likewise for PNx2N */
         if (IS_ALG_ENABLE(OPT_BYPASS_AMP) && i_level > B16X16_IN_BIT) {
             if ((mode == PRED_2NxnU || mode == PRED_2NxnD) && best->i_mode != PRED_2NxN) {
                 continue;
@@ -3119,7 +4178,7 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
 
 
         /* -------------------------------------------------------------
-         * 3.2, ³¢ÊÔ±àÂëµ±Ç°PU»®·ÖÄ£Ê½
+         * 3.2, try encoding the current PU partition mode
          */
         p_cu->cu_info.i_mode = (int8_t)mode;
         if (IS_ALG_ENABLE(OPT_ROUGH_PU_SEL) && mode == PRED_2Nx2N) {
@@ -3127,15 +4186,15 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
             cu_select_inter_partition(h, p_cu, i_level, avail_modes, &cur_best, &min_rdcost, b_dhp_enabled, b_check_dmh);
             mode = cur_best.i_mode;
             cu_copy_info(&p_cu->cu_info, &cur_best);
-            memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc_tmp, sizeof(p_cu->mc));  /* ¿½±´MVÐÅÏ¢ÓÃÓÚ²¹³¥ */
+            memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc_tmp, sizeof(p_cu->mc));  /* copy MV info for compensation */
             cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best);
-            avail_modes &= ~0xfe;   // ½ûÓÃµôÊ£ÓàÖ¡¼ä»®·ÖÄ£Ê½
+            avail_modes &= ~0xfe;   // disable the remaining inter partition modes
         } else {
             cu_check_inter_partition(h, p_aec, p_cu, mode, i_level, best, &min_rdcost, b_dhp_enabled, b_check_dmh);
         }
 
         /* -------------------------------------------------------------
-         * 3.3, µ±Ç°ÆÕÍ¨PU»®·ÖÄ£Ê½±àÂëºóµÄ¿ìËÙ¾ö²ßËã·¨
+         * 3.3, fast decision algorithms applied after encoding the current normal PU partition mode
          */
 
         if (best->i_mode == mode) {
@@ -3165,29 +4224,29 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         }
     }
 
-    /* ×öµÚ¶þ²ãTU»®·Ö£¬Ñ¡³ö×îÓÅÄ£Ê½ */
+    /* perform the second-level TU split and select the best mode */
     if (IS_ALG_ENABLE(OPT_TU_LEVEL_DEC) && best->i_cbp > 0) {
         h->enable_tu_2level = 1;
         mode = best->i_mode;
         cu_copy_info(&p_cu->cu_info, best);
-        memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc));  /* ¿½±´MVÐÅÏ¢ÓÃÓÚ²¹³¥ */
+        memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc));  /* copy MV info for compensation */
         cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best);
     }// end of checking inter PU partitions
 
-    /* Í¨¹ýÖ¡¼¶Ô¤·ÖÎöÅÐ¶¨£¬´ËÖ¡²»ÐèÒª×öÖ¡ÄÚÔ¤²âÊ±£¬Ìø¹ýºóÐøÖ¡ÄÚÄ£Ê½ */
+    /* when frame-level pre-analysis decides this frame needs no intra prediction, skip the subsequent intra modes */
     if (!h->fenc->b_enable_intra) {
         b_bypass_intra = 1;
     }
 
     if (IS_ALG_ENABLE(OPT_BYPASS_INTRA_BPIC)) {
-        b_bypass_intra |= (h->i_type == SLICE_TYPE_B && best->i_cbp == 0);   // ½ûÓÃBÖ¡µÄÖ¡ÄÚÔ¤²âÄ£Ê½
+        b_bypass_intra |= (h->i_type == SLICE_TYPE_B && best->i_cbp == 0);   // disable intra prediction modes for B frames
     }
 
-    /* Ìõ¼þ½ûÓÃ²¿·ÖÖ¡ÄÚ»®·ÖÄ£Ê½ */
+    /* conditionally disable some intra partition modes */
     if (IS_ALG_ENABLE(OPT_CMS_ETMD)) {
-        /* Ö¡¼äÄ£Ê½×öÍêÖ®ºó£¬Èô×îÓÅÄ£Ê½µÄCBPÎªÁã£¬Ôò²»ÔÙ±éÀúËùÓÐÖ¡ÄÚÔ¤²âÄ£Ê½ */
+        /* after the inter modes are done, if the CBP of the best mode is zero, do not traverse all intra prediction modes */
         b_bypass_intra |= ((best->i_cbp == 0) && (best->i_mode == 0));
-        /* ÒÀ¾ÝÖ¡¼ä×îÓÅ»®·ÖÄ£Ê½£¬É¸Ñ¡²»ÐèÒª±éÀúµÄÄ£Ê½ */
+        /* based on the best inter partition mode, filter out the modes that need not be traversed */
         // if (IS_HOR_PU_PART(best->i_mode)) {
         //     avail_modes &= !(1 << PRED_I_nx2N);
         // } else if (IS_VER_PU_PART(best->i_mode)) {
@@ -3207,7 +4266,7 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         }
     }
 
-    /* Èôµ±Ç°×îÐ¡RDCostÐ¡ÓÚÁËÄ³¸öãÐÖµ£¬±íÃ÷Ö¡¼äÔ¤²âÄ£Ê½ÒÑ¾­ÄÜ¹»½ÏºÃµØÔ¤²â£¬´ËÊ±²»ÔÙ¼ÌÐø³¢ÊÔÖ¡ÄÚÄ£Ê½ */
+    /* if the current minimum RDCost is below a threshold, inter prediction already predicts well enough, so intra modes are not tried any further */
     if (IS_ALG_ENABLE(OPT_FAST_INTRA_IN_INTER) && min_rdcost < h->thres_qsfd_cu[1][i_level - MIN_CU_SIZE_IN_BIT]) {
         b_bypass_intra = 1;
     }
@@ -3218,11 +4277,11 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
     if (!b_bypass_intra) {
         for (mode = PRED_I_2Nx2N; mode <= PRED_I_nx2N; mode++) {
             if (!(avail_modes & (1 << mode))) {
-                continue;           // Ö±½ÓÌø¹ý²»¿ÉÓÃÄ£Ê½µÄ¾ö²ß
+                continue;           // skip the decision for unavailable modes directly
             }
 
             if (IS_ALG_ENABLE(OPT_BYPASS_SDIP)) {
-                // ×îºóÒ»¸ö·Ç¶Ô³ÆÖ¡ÄÚÄ£Ê½µÄÌáÇ°Ìø¹ý
+                // early bypass of the last asymmetric intra mode
                 if (sdip_early_bypass(h, p_layer, mode)) {
                     continue;
                 }
@@ -3242,14 +4301,18 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         }
     }
 
-    /* ¼ì²é×îÓÅÄ£Ê½,°üÀ¨TU»®·Ö»¹ÊÇ²»»®·ÖµÄÈ·¶¨£¬´øRDOQ */
+    /* check the best mode, including deciding whether or not to split the TU, with RDOQ */
     if (h->param->i_rdoq_level == RDOQ_CU_LEVEL&& best->i_cbp > 0) {
         if (IS_ALG_ENABLE(OPT_TU_LEVEL_DEC)) {
             h->enable_tu_2level = 3;
         } else {
             h->enable_tu_2level = 2;
         }
-        h->lcu.get_intra_dir_for_rdo_luma = rdo_get_pred_intra_luma_2nd_pass;
+        if (h->param->input_sample_bit_depth == 8) {
+        h->lcu.get_intra_dir_for_rdo_luma8 = rdo_get_pred_intra_luma8_2nd_pass;
+        } else {
+        h->lcu.get_intra_dir_for_rdo_luma10 = rdo_get_pred_intra_luma10_2nd_pass;
+        }
         h->lcu.b_enable_rdoq = 1;
         h->lcu.b_2nd_rdcost_pass = 1;
         mode = best->i_mode;
@@ -3259,12 +4322,16 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
                 cu_check_intra(h, p_aec, p_cu, best, mode, &min_rdcost);
             }
         } else {
-            memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc));  /* ¿½±´MVÐÅÏ¢ÓÃÓÚ²¹³¥ */
+            memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc));  /* copy MV info for compensation */
             cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best);
         }
     } else if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && i_level >= 5 && (best->i_mode != PRED_SKIP || best->i_cbp != 0)) {
         h->enable_tu_2level = 2;
-        h->lcu.get_intra_dir_for_rdo_luma = rdo_get_pred_intra_luma_2nd_pass;
+        if (h->param->input_sample_bit_depth == 8) {
+        h->lcu.get_intra_dir_for_rdo_luma8 = rdo_get_pred_intra_luma8_2nd_pass;
+        } else {
+        h->lcu.get_intra_dir_for_rdo_luma10 = rdo_get_pred_intra_luma10_2nd_pass;
+        }
         h->lcu.b_2nd_rdcost_pass = 1;
         // recheck RDCost
         mode = best->i_mode;
@@ -3272,7 +4339,7 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best
         if (IS_INTRA_MODE(mode)) {
             cu_check_intra(h, p_aec, p_cu, best, mode, &min_rdcost);
         } else {
-            memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc));  /* ¿½±´MVÐÅÏ¢ÓÃÓÚ²¹³¥ */
+            memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc));  /* copy MV info for compensation */
             cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best);
         }
     }
@@ -3288,14 +4355,20 @@ int ctu_intra_depth_pred_mad(xavs2_t *h, int level, int pix_x, int pix_y)
     static const int MAD_TH0[] = {
         2, 2 * 256, 2 * 1024, 3 * 4096
     };
-    pel_t *p_src_base = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE + pix_x;
     int cu_size = 1 << level;
 
-    int mad = g_funcs.pixf.madf[level - MIN_CU_SIZE_IN_BIT](p_src_base, FENC_STRIDE, cu_size);
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_src_base = h->lcu.p_fenc8[0] + pix_y * FENC_STRIDE + pix_x;
+    int mad = g_funcs.pixf.madf8[level - MIN_CU_SIZE_IN_BIT](p_src_base, FENC_STRIDE, cu_size);
 
     return mad >= MAD_TH0[level - MIN_CU_SIZE_IN_BIT];
-}
+    } else {
+    pel10_t *p_src_base = h->lcu.p_fenc10[0] + pix_y * FENC_STRIDE + pix_x;
+    int mad = g_funcs.pixf.madf10[level - MIN_CU_SIZE_IN_BIT](p_src_base, FENC_STRIDE, cu_size);
 
+    return mad >= MAD_TH0[level - MIN_CU_SIZE_IN_BIT];
+    }
+}
 
 /**
  * ===========================================================================
@@ -3304,7 +4377,7 @@ int ctu_intra_depth_pred_mad(xavs2_t *h, int level, int pix_x, int pix_y)
  */
 
 /* ---------------------------------------------------------------------------
- * RDOPT³õÊ¼»¯Ê±£¬ÉèÖÃ²»Í¬Ö¡ºÍCU´óÐ¡¿ÉÓÃµÄÄ£Ê½£¬ºóÐøÖ±½Ó²é±í
+ * at RDOPT initialization, set the modes available for different frames and CU sizes; later code looks them up in the table directly
  */
 void xavs2_init_valid_mode_table(xavs2_t *h)
 {
@@ -3466,7 +4539,7 @@ rdcost_t compress_ctu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, i
             b_split_ctu &= !is_ET_inter_recur(h, p_cu, best);
         }
 
-        /* µ±Ç°CUºÍÉÏÒ»²ãCUµÄ×îÓÅÄ£Ê½¾ùÎªSKIPÄ£Ê½£¬ÔòÌø¹ýÏÂ²ãCUµÄ»®·Ö @ÕÅÓñ»± */
+        /* if the best modes of both the current CU and the upper-level CU are SKIP, skip the split into lower-level CUs @Zhang Yuhuai */
         if (IS_ALG_ENABLE(OPT_CU_CSET) &&
             ((p_cu->i_size <= 16 && h->i_type == SLICE_TYPE_B) || (p_cu->i_size <= 32 && h->fdec->rps.referd_by_others == 0))) {
             cu_layer_t *p_ulayer = cu_get_layer(h, i_level + 1);
@@ -3524,7 +4597,7 @@ rdcost_t compress_ctu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, i
     if (IS_ALG_ENABLE(OPT_SUBCU_SPLIT)) {
         if ((p_cu->sub_cu[0] != NULL) && (p_cu->sub_cu[1] != NULL) && (p_cu->sub_cu[2] != NULL) && (p_cu->sub_cu[3] != NULL)) {
             if (((p_cu->sub_cu[0]->is_ctu_split + p_cu->sub_cu[1]->is_ctu_split + p_cu->sub_cu[2]->is_ctu_split + p_cu->sub_cu[3]->is_ctu_split) >= 3)) {
-                b_check_large_cu = FALSE;   // 1080p 20% ½ÚÊ¡£¬Ô¼1.7%ËðÊ§£¬preset 6£¬1080p
+                b_check_large_cu = FALSE;   // 1080p 20% saving, about 1.7% loss, preset 6, 1080p
             }
             /* else if (((!p_cu->sub_cu[0]->is_ctu_split) && ((p_cu->sub_cu[0]->cu_info.i_mode == PRED_SKIP || p_cu->sub_cu[0]->cu_info.i_mode == PRED_2Nx2N) && (p_cu->sub_cu[0]->cu_info.i_cbp == 0)))
             && ((!p_cu->sub_cu[1]->is_ctu_split) && ((p_cu->sub_cu[1]->cu_info.i_mode == PRED_SKIP || p_cu->sub_cu[1]->cu_info.i_mode == PRED_2Nx2N) && (p_cu->sub_cu[1]->cu_info.i_cbp == 0)))
diff --git a/source/encoder/sao.c b/source/encoder/sao.c
index f47d676..6ffdd13 100644
--- a/source/encoder/sao.c
+++ b/source/encoder/sao.c
@@ -41,7 +41,9 @@
 #include "filter.h"
 #include "cpu.h"
 #include "cudata.h"
+#if HAVE_MMX
 #include "vec/intrinsic.h"
+#endif
 
 static const int tab_sao_check_mode_fast[3][5] = {
     1, 1, 0, 0, 0,
@@ -65,7 +67,7 @@ static ALWAYS_INLINE void sao_init_stat_data(SAOStatData *p_stats)
 /* ---------------------------------------------------------------------------
  */
 static
-void sao_get_stat_block_EO_0(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
+void sao_get_stat_block_EO_0(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
                              SAOStatData *p_stats, sao_region_t *p_region, int compIdx)
 {
     int start_x, end_x, start_y, end_y;
@@ -81,10 +83,11 @@ void sao_get_stat_block_EO_0(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
 
     int i_rec = frm_rec->i_stride[compIdx];
     int i_org = frm_org->i_stride[compIdx];
-    const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x;
-    const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x;
-    const pel_t *p_org_iter;
-    const pel_t *p_rec_iter;
+    if (h->param->input_sample_bit_depth == 8) {
+    const pel8_t *p_rec = frm_rec->planes8[compIdx] + pix_y * i_rec + pix_x;
+    const pel8_t *p_org = frm_org->planes8[compIdx] + pix_y * i_org + pix_x;
+    const pel8_t *p_org_iter;
+    const pel8_t *p_rec_iter;
     sao_init_stat_data(p_stats);
     p_org_iter = p_org;
     p_rec_iter = p_rec;
@@ -106,12 +109,39 @@ void sao_get_stat_block_EO_0(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
         p_rec_iter += i_rec;
         p_org_iter += i_org;
     }
+    } else {    /* input_sample_bit_depth != 8: gather the same statistics from the pel10_t planes */
+    const pel10_t *p_rec = frm_rec->planes10[compIdx] + pix_y * i_rec + pix_x;
+    const pel10_t *p_org = frm_org->planes10[compIdx] + pix_y * i_org + pix_x;
+    const pel10_t *p_org_iter;
+    const pel10_t *p_rec_iter;
+    sao_init_stat_data(p_stats);
+    p_org_iter = p_org;
+    p_rec_iter = p_rec;
+    start_y = 0;
+    end_y = height;
+    start_x = p_region->b_left ? 0 : 1;                 /* column 0 is visited only when b_left is set (x - 1 is read below) */
+    end_x = p_region->b_right ? width : (width - 1);    /* last column is visited only when b_right is set (x + 1 is read below) */
+    p_org_iter = p_org + start_y * i_org;
+    p_rec_iter += start_y * i_rec;
+    for (y = start_y; y < end_y; y++) {
+        leftsign = xavs2_sign3(p_rec_iter[start_x] - p_rec_iter[start_x - 1]);   /* seed: sign against the left neighbour of the first column */
+        for (x = start_x; x < end_x; x++) {
+            rightsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1]);
+            edgetype = leftsign + rightsign;
+            leftsign = -rightsign;                      /* the next sample's left sign is the negation of this sample's right sign */
+            p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]);      /* +2 offsets edgetype into the table index (assumes xavs2_sign3 yields -1/0/1 - TODO confirm) */
+            p_stats->count[edgetype + 2]++;
+        }
+        p_rec_iter += i_rec;
+        p_org_iter += i_org;
+    }
+    }
 }
 
 /* ---------------------------------------------------------------------------
 */
 static
-void sao_get_stat_block_EO_90(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
+void sao_get_stat_block_EO_90(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
                               SAOStatData *p_stats, sao_region_t *p_region, int compIdx)
 {
     int start_x, end_x, start_y, end_y;
@@ -127,10 +157,11 @@ void sao_get_stat_block_EO_90(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
 
     int i_rec = frm_rec->i_stride[compIdx];
     int i_org = frm_org->i_stride[compIdx];
-    const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x;
-    const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x;
-    const pel_t *p_org_iter;
-    const pel_t *p_rec_iter;
+    if (h->param->input_sample_bit_depth == 8) {
+    const pel8_t *p_rec = frm_rec->planes8[compIdx] + pix_y * i_rec + pix_x;
+    const pel8_t *p_org = frm_org->planes8[compIdx] + pix_y * i_org + pix_x;
+    const pel8_t *p_org_iter;
+    const pel8_t *p_rec_iter;
 
     sao_init_stat_data(p_stats);
 
@@ -150,12 +181,37 @@ void sao_get_stat_block_EO_90(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
             p_stats->count[edgetype + 2]++;
         }
     }
+    } else {
+    const pel10_t *p_rec = frm_rec->planes10[compIdx] + pix_y * i_rec + pix_x;
+    const pel10_t *p_org = frm_org->planes10[compIdx] + pix_y * i_org + pix_x;
+    const pel10_t *p_org_iter;
+    const pel10_t *p_rec_iter;
+
+    sao_init_stat_data(p_stats);
+
+    p_org_iter = p_org;
+    p_rec_iter = p_rec;
+    start_x = 0;
+    end_x = width;
+    start_y = p_region->b_top ? 0 : 1;
+    end_y = p_region->b_down ? height : (height - 1);
+    for (x = start_x; x < end_x; x++) {
+        upsign = xavs2_sign3(p_rec_iter[start_y * i_rec + x] - p_rec_iter[(start_y - 1) * i_rec + x]);
+        for (y = start_y; y < end_y; y++) {
+            downsign = xavs2_sign3(p_rec_iter[y * i_rec + x] - p_rec_iter[(y + 1) * i_rec + x]);
+            edgetype = downsign + upsign;
+            upsign = -downsign;
+            p_stats->diff[edgetype + 2] += (p_org_iter[y * i_org + x] - p_rec_iter[y * i_rec + x]);
+            p_stats->count[edgetype + 2]++;
+        }
+    }
+    }
 }
 
 /* ---------------------------------------------------------------------------
 */
 static
-void sao_get_stat_block_EO_135(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
+void sao_get_stat_block_EO_135(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
                                SAOStatData *p_stats, sao_region_t *p_region, int compIdx)
 {
     int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn;
@@ -173,10 +229,71 @@ void sao_get_stat_block_EO_135(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
 
     int i_rec = frm_rec->i_stride[compIdx];
     int i_org = frm_org->i_stride[compIdx];
-    const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x;
-    const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x;
-    const pel_t *p_org_iter;
-    const pel_t *p_rec_iter;
+    if (h->param->input_sample_bit_depth == 8) {
+    const pel8_t *p_rec = frm_rec->planes8[compIdx] + pix_y * i_rec + pix_x;
+    const pel8_t *p_org = frm_org->planes8[compIdx] + pix_y * i_org + pix_x;
+    const pel8_t *p_org_iter;
+    const pel8_t *p_rec_iter;
+
+    sao_init_stat_data(p_stats);
+
+    p_org_iter = p_org;
+    p_rec_iter = p_rec;
+    start_x_r0 = p_region->b_top_left ? 0 : 1;
+    end_x_r0 = p_region->b_top ? (p_region->b_right ? width : (width - 1)) : 1;
+    start_x_r = p_region->b_left ? 0 : 1;
+    end_x_r = p_region->b_right ? width : (width - 1);
+    start_x_rn = p_region->b_down ? (p_region->b_left ? 0 : 1) : (width - 1);
+    end_x_rn = p_region->b_right_down ? width : (width - 1);
+
+    // init the line buffer
+    for (x = start_x_r + 1; x < end_x_r + 1; x++) {
+        upsign = xavs2_sign3(p_rec_iter[x + i_rec] - p_rec_iter[x - 1]);
+        signupline[x] = upsign;
+    }
+    // first row
+    for (x = start_x_r0; x < end_x_r0; x++) {
+        upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 - i_rec]);
+        edgetype = upsign - signupline[x + 1];
+        p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]);
+        p_stats->count[edgetype + 2]++;
+    }
+
+    // middle rows
+    p_rec_iter += i_rec;
+    p_org_iter += i_org;
+    for (y = 1; y < height - 1; y++) {
+        for (x = start_x_r; x < end_x_r; x++) {
+            if (x == start_x_r) {
+                upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 - i_rec]);
+                signupline[x] = upsign;
+            }
+            downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 + i_rec]);
+            edgetype = downsign + signupline[x];
+            p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]);
+            p_stats->count[edgetype + 2]++;
+            signupline[x] = (char)reg;
+            reg = -downsign;
+        }
+        p_rec_iter += i_rec;
+        p_org_iter += i_org;
+    }
+    // last row
+    for (x = start_x_rn; x < end_x_rn; x++) {
+        if (x == start_x_r) {
+            upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 - i_rec]);
+            signupline[x] = upsign;
+        }
+        downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 + i_rec]);
+        edgetype = downsign + signupline[x];
+        p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]);
+        p_stats->count[edgetype + 2]++;
+    }
+    } else {
+    const pel10_t *p_rec = frm_rec->planes10[compIdx] + pix_y * i_rec + pix_x;
+    const pel10_t *p_org = frm_org->planes10[compIdx] + pix_y * i_org + pix_x;
+    const pel10_t *p_org_iter;
+    const pel10_t *p_rec_iter;
 
     sao_init_stat_data(p_stats);
 
@@ -232,12 +349,13 @@ void sao_get_stat_block_EO_135(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
         p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]);
         p_stats->count[edgetype + 2]++;
     }
+    }
 }
 
 /* ---------------------------------------------------------------------------
 */
 static
-void sao_get_stat_block_EO_45(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
+void sao_get_stat_block_EO_45(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
                               SAOStatData *p_stats, sao_region_t *p_region, int compIdx)
 {
     int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn;
@@ -255,10 +373,11 @@ void sao_get_stat_block_EO_45(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
 
     int i_rec = frm_rec->i_stride[compIdx];
     int i_org = frm_org->i_stride[compIdx];
-    const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x;
-    const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x;
-    const pel_t *p_org_iter;
-    const pel_t *p_rec_iter;
+    if (h->param->input_sample_bit_depth == 8) {
+    const pel8_t *p_rec = frm_rec->planes8[compIdx] + pix_y * i_rec + pix_x;
+    const pel8_t *p_org = frm_org->planes8[compIdx] + pix_y * i_org + pix_x;
+    const pel8_t *p_org_iter;
+    const pel8_t *p_rec_iter;
 
     sao_init_stat_data(p_stats);
 
@@ -313,12 +432,72 @@ void sao_get_stat_block_EO_45(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
         p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]);
         p_stats->count[edgetype + 2]++;
     }
+    } else {
+    const pel10_t *p_rec = frm_rec->planes10[compIdx] + pix_y * i_rec + pix_x;
+    const pel10_t *p_org = frm_org->planes10[compIdx] + pix_y * i_org + pix_x;
+    const pel10_t *p_org_iter;
+    const pel10_t *p_rec_iter;
+
+    sao_init_stat_data(p_stats);
+
+    p_org_iter = p_org;
+    p_rec_iter = p_rec;
+    start_x_r0 = p_region->b_top ? (p_region->b_left ? 0 : 1) : (width - 1);
+    end_x_r0 = p_region->b_top_right ? width : (width - 1);
+    start_x_r = p_region->b_left ? 0 : 1;
+    end_x_r = p_region->b_right ? width : (width - 1);
+    start_x_rn = p_region->b_down_left ? 0 : 1;
+    end_x_rn = p_region->b_down ? (p_region->b_right ? width : (width - 1)) : 1;
+
+    // init the line buffer
+    signupline1 = signupline + 1;
+    for (x = start_x_r - 1; x < XAVS2_MAX(end_x_r - 1, end_x_r0 - 1); x++) {
+        upsign = xavs2_sign3(p_rec_iter[x + i_rec] - p_rec_iter[x + 1]);
+        signupline1[x] = upsign;
+    }
+    // first row
+    for (x = start_x_r0; x < end_x_r0; x++) {
+        upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 - i_rec]);
+        edgetype = upsign - signupline1[x - 1];
+        p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]);
+        p_stats->count[edgetype + 2]++;
+    }
+
+    // middle rows
+    p_rec_iter += i_rec;
+    p_org_iter += i_org;
+    for (y = 1; y < height - 1; y++) {
+        for (x = start_x_r; x < end_x_r; x++) {
+            if (x == end_x_r - 1) {
+                upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 - i_rec]);
+                signupline1[x] = upsign;
+            }
+            downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 + i_rec]);
+            edgetype = downsign + signupline1[x];
+            p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]);
+            p_stats->count[edgetype + 2]++;
+            signupline1[x - 1] = -downsign;
+        }
+        p_rec_iter += i_rec;
+        p_org_iter += i_org;
+    }
+    for (x = start_x_rn; x < end_x_rn; x++) {
+        if (x == end_x_r - 1) {
+            upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 - i_rec]);
+            signupline1[x] = upsign;
+        }
+        downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 + i_rec]);
+        edgetype = downsign + signupline1[x];
+        p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]);
+        p_stats->count[edgetype + 2]++;
+    }
+    }
 }
 
 /* ---------------------------------------------------------------------------
 */
 static
-void sao_get_stat_block_BO(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
+void sao_get_stat_block_BO(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
                            SAOStatData *p_stats, sao_region_t *p_region, int compIdx)
 {
     int start_x, end_x, start_y, end_y;
@@ -334,16 +513,41 @@ void sao_get_stat_block_BO(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
 
     int i_rec = frm_rec->i_stride[compIdx];
     int i_org = frm_org->i_stride[compIdx];
-    const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x;
-    const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x;
-    const pel_t *p_org_iter;
-    const pel_t *p_rec_iter;
+    if (h->param->input_sample_bit_depth == 8) {
+    const pel8_t *p_rec = frm_rec->planes8[compIdx] + pix_y * i_rec + pix_x;
+    const pel8_t *p_org = frm_org->planes8[compIdx] + pix_y * i_org + pix_x;
+    const pel8_t *p_org_iter;
+    const pel8_t *p_rec_iter;
+
+    sao_init_stat_data(p_stats);
+
+    p_org_iter = p_org;
+    p_rec_iter = p_rec;
+    band_shift = (h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT);
+    start_x = 0;
+    end_x = width;
+    start_y = 0;
+    end_y = height;
+    for (y = start_y; y < end_y; y++) {
+        for (x = start_x; x < end_x; x++) {
+            bandtype = p_rec_iter[x] >> band_shift;
+            p_stats->diff[bandtype] += (p_org_iter[x] - p_rec_iter[x]);
+            p_stats->count[bandtype]++;
+        }
+        p_rec_iter += i_rec;
+        p_org_iter += i_org;
+    }
+    } else {
+    const pel10_t *p_rec = frm_rec->planes10[compIdx] + pix_y * i_rec + pix_x;
+    const pel10_t *p_org = frm_org->planes10[compIdx] + pix_y * i_org + pix_x;
+    const pel10_t *p_org_iter;
+    const pel10_t *p_rec_iter;
 
     sao_init_stat_data(p_stats);
 
     p_org_iter = p_org;
     p_rec_iter = p_rec;
-    band_shift = (g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT);
+    band_shift = (h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT);
     start_x = 0;
     end_x = width;
     start_y = 0;
@@ -357,11 +561,12 @@ void sao_get_stat_block_BO(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
         p_rec_iter += i_rec;
         p_org_iter += i_org;
     }
+    }
 }
 
 /* ---------------------------------------------------------------------------
 */
-typedef void(*sao_pf)(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
+typedef void(*sao_pf)(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org,
                       SAOStatData *stat_datas, sao_region_t *p_region, int compIdx);
 
 sao_pf gf_sao_stat[5] = {
@@ -517,7 +722,7 @@ static void find_offset(int typeIdc, SAOStatData *p_stat, SAOBlkParam *p_param,
         start_band2 = XAVS2_MAX(best_start_band1, best_start_band2);
         delta_band12 = (start_band2 - start_band1);
         if (delta_band12 > (NUM_SAO_BO_CLASSES >> 1)) {
-            p_param->deltaBand = 32 - delta_band12;  // TODO: ÕâÀïÓ¦¸ÃÊÇ (32 + delta_band12)
+            p_param->deltaBand = 32 - delta_band12;  // TODO: this should be (32 + delta_band12)
             p_param->startBand = start_band2;
         } else {
             p_param->deltaBand = delta_band12;
@@ -733,8 +938,168 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
 
     int i_src = h->img_sao->i_stride[compIdx];
     int i_dst = h->fdec->i_stride[compIdx];
-    pel_t *dst = h->fdec->planes[compIdx] + pix_y * i_dst + pix_x;
-    pel_t *src = h->img_sao->planes[compIdx] + pix_y * i_src + pix_x;
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *dst = h->fdec->planes8[compIdx] + pix_y * i_dst + pix_x;
+    pel8_t *src = h->img_sao->planes8[compIdx] + pix_y * i_src + pix_x;
+
+    assert(blk_param->typeIdc != SAO_TYPE_OFF);
+
+    switch (blk_param->typeIdc) {
+    case SAO_TYPE_EO_0:
+        end_y = height;
+        start_x = p_region->b_left ? 0 : 1;
+        end_x = p_region->b_right ? width : (width - 1);
+        for (y = 0; y < end_y; y++) {
+            leftsign = xavs2_sign3(src[start_x] - src[start_x - 1]);
+            for (x = start_x; x < end_x; x++) {
+                rightsign = xavs2_sign3(src[x] - src[x + 1]);
+                edgetype = leftsign + rightsign;
+                leftsign = -rightsign;
+                dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+            }
+            src += i_src;
+            dst += i_dst;
+        }
+        break;
+    case SAO_TYPE_EO_90: {
+        pel8_t *src_base = src;
+        pel8_t *dst_base = dst;
+        start_x = 0;
+        end_x = width;
+        start_y = p_region->b_top ? 0 : 1;
+        end_y = p_region->b_down ? height : (height - 1);
+
+        src_base += start_y * i_src;
+        dst_base += start_y * i_dst;
+        for (x = start_x; x < end_x; x++) {
+            src = src_base;
+            dst = dst_base;
+            upsign = xavs2_sign3(src[0] - src[-i_src]);
+            for (y = start_y; y < end_y; y++) {
+                downsign = xavs2_sign3(src[0] - src[i_src]);
+                edgetype = downsign + upsign;
+                upsign = -downsign;
+                *dst = (pel8_t)XAVS2_CLIP3(0, max_val, src[0] + blk_param->offset[edgetype + 2]);
+                src += i_src;
+                dst += i_dst;
+            }
+            src_base++;
+            dst_base++;
+        }
+        break;
+    }
+    case SAO_TYPE_EO_135: {
+        start_x_r0 = p_region->b_top_left ? 0 : 1;
+        end_x_r0 = p_region->b_top ? (p_region->b_right ? width : (width - 1)) : 1;
+        start_x_r = p_region->b_left ? 0 : 1;
+        end_x_r = p_region->b_right ? width : (width - 1);
+        start_x_rn = p_region->b_down ? (p_region->b_left ? 0 : 1) : (width - 1);
+        end_x_rn = p_region->b_right_down ? width : (width - 1);
+
+        // init the line buffer
+        for (x = start_x_r + 1; x < end_x_r + 1; x++) {
+            signupline[x] = xavs2_sign3(src[x + i_src] - src[x - 1]);
+        }
+        // first row
+        for (x = start_x_r0; x < end_x_r0; x++) {
+            upsign = xavs2_sign3(src[x] - src[x - 1 - i_src]);
+            edgetype = upsign - signupline[x + 1];
+            dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+        }
+        // middle rows
+        src += i_src;
+        dst += i_dst;
+        for (y = 1; y < height - 1; y++) {
+            x = start_x_r;
+            signupline[x] = xavs2_sign3(src[x] - src[x - 1 - i_src]);
+            for (; x < end_x_r; x++) {
+                downsign = xavs2_sign3(src[x] - src[x + 1 + i_src]);
+                edgetype = downsign + signupline[x];
+                dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+                signupline[x] = reg;
+                reg = -downsign;
+            }
+            dst += i_dst;
+            src += i_src;
+        }
+        // last row
+        x = start_x_rn;
+        signupline[x] = xavs2_sign3(src[x] - src[x - 1 - i_src]);
+        for (; x < end_x_rn; x++) {
+            downsign = xavs2_sign3(src[x] - src[x + 1 + i_src]);
+            edgetype = downsign + signupline[x];
+            dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+        }
+    }
+    break;
+    case SAO_TYPE_EO_45: {
+        start_x_r0 = p_region->b_top ? (p_region->b_left ? 0 : 1) : (width - 1);
+        end_x_r0 = p_region->b_top_right ? width : (width - 1);
+        start_x_r = p_region->b_left ? 0 : 1;
+        end_x_r = p_region->b_right ? width : (width - 1);
+        start_x_rn = p_region->b_down_left ? 0 : 1;
+        end_x_rn = p_region->b_down ? (p_region->b_right ? width : (width - 1)) : 1;
+
+        // init the line buffer
+        for (x = start_x_r; x < end_x_r; x++) {
+            signupline[x] = xavs2_sign3(src[x - 1 + i_src] - src[x]);
+        }
+        // first row
+        for (x = start_x_r0; x < end_x_r0; x++) {
+            upsign = xavs2_sign3(src[x] - src[x + 1 - i_src]);
+            edgetype = upsign - signupline[x];
+            dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+        }
+        // middle rows
+        src += i_src;
+        dst += i_dst;
+        for (y = 1; y < height - 1; y++) {
+            signupline[end_x_r] = xavs2_sign3(src[end_x_r - 1] - src[end_x_r - i_src]);
+            for (x = start_x_r; x < end_x_r; x++) {
+                downsign = xavs2_sign3(src[x] - src[x - 1 + i_src]);
+                edgetype = downsign + signupline[x + 1];
+                dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+                signupline[x] = -downsign;
+            }
+            src += i_src;
+            dst += i_dst;
+        }
+        //last row
+        for (x = start_x_rn; x < end_x_rn; x++) {
+            if (x == end_x_r - 1) {
+                upsign = xavs2_sign3(src[x] - src[x + 1 - i_src]);
+                signupline[x + 1] = upsign;
+            }
+            downsign = xavs2_sign3(src[x] - src[x - 1 + i_src]);
+            edgetype = downsign + signupline[x + 1];
+            dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+        }
+        break;
+    }
+    case SAO_TYPE_BO:
+        band_shift = (h->param->sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT);
+        start_x = 0;
+        end_x = width;
+        start_y = 0;
+        end_y = height;
+        src += start_y * i_src;
+        dst += start_y * i_dst;
+        for (y = start_y; y < end_y; y++) {
+            for (x = start_x; x < end_x; x++) {
+                bandtype = src[x] >> band_shift;
+                dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[bandtype]);
+            }
+            src += i_src;
+            dst += i_dst;
+        }
+        break;
+    default:
+        xavs2_log(h, XAVS2_LOG_ERROR, "Not a supported SAO types for SAO_on_Block\n");
+        exit(-1);
+    }
+    } else {
+    pel10_t *dst = h->fdec->planes10[compIdx] + pix_y * i_dst + pix_x;
+    pel10_t *src = h->img_sao->planes10[compIdx] + pix_y * i_src + pix_x;
 
     assert(blk_param->typeIdc != SAO_TYPE_OFF);
 
@@ -749,15 +1114,15 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
                 rightsign = xavs2_sign3(src[x] - src[x + 1]);
                 edgetype = leftsign + rightsign;
                 leftsign = -rightsign;
-                dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+                dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
             }
             src += i_src;
             dst += i_dst;
         }
         break;
     case SAO_TYPE_EO_90: {
-        pel_t *src_base = src;
-        pel_t *dst_base = dst;
+        pel10_t *src_base = src;
+        pel10_t *dst_base = dst;
         start_x = 0;
         end_x = width;
         start_y = p_region->b_top ? 0 : 1;
@@ -773,7 +1138,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
                 downsign = xavs2_sign3(src[0] - src[i_src]);
                 edgetype = downsign + upsign;
                 upsign = -downsign;
-                *dst = (pel_t)XAVS2_CLIP3(0, max_val, src[0] + blk_param->offset[edgetype + 2]);
+                *dst = (pel10_t)XAVS2_CLIP3(0, max_val, src[0] + blk_param->offset[edgetype + 2]);
                 src += i_src;
                 dst += i_dst;
             }
@@ -798,7 +1163,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
         for (x = start_x_r0; x < end_x_r0; x++) {
             upsign = xavs2_sign3(src[x] - src[x - 1 - i_src]);
             edgetype = upsign - signupline[x + 1];
-            dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+            dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
         }
         // middle rows
         src += i_src;
@@ -809,7 +1174,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
             for (; x < end_x_r; x++) {
                 downsign = xavs2_sign3(src[x] - src[x + 1 + i_src]);
                 edgetype = downsign + signupline[x];
-                dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+                dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
                 signupline[x] = reg;
                 reg = -downsign;
             }
@@ -822,7 +1187,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
         for (; x < end_x_rn; x++) {
             downsign = xavs2_sign3(src[x] - src[x + 1 + i_src]);
             edgetype = downsign + signupline[x];
-            dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+            dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
         }
     }
     break;
@@ -842,7 +1207,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
         for (x = start_x_r0; x < end_x_r0; x++) {
             upsign = xavs2_sign3(src[x] - src[x + 1 - i_src]);
             edgetype = upsign - signupline[x];
-            dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+            dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
         }
         // middle rows
         src += i_src;
@@ -852,7 +1217,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
             for (x = start_x_r; x < end_x_r; x++) {
                 downsign = xavs2_sign3(src[x] - src[x - 1 + i_src]);
                 edgetype = downsign + signupline[x + 1];
-                dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+                dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
                 signupline[x] = -downsign;
             }
             src += i_src;
@@ -866,7 +1231,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
             }
             downsign = xavs2_sign3(src[x] - src[x - 1 + i_src]);
             edgetype = downsign + signupline[x + 1];
-            dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
+            dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]);
         }
         break;
     }
@@ -881,7 +1246,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
         for (y = start_y; y < end_y; y++) {
             for (x = start_x; x < end_x; x++) {
                 bandtype = src[x] >> band_shift;
-                dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[bandtype]);
+                dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[bandtype]);
             }
             src += i_src;
             dst += i_dst;
@@ -891,6 +1256,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s
         xavs2_log(h, XAVS2_LOG_ERROR, "Not a supported SAO types for SAO_on_Block\n");
         exit(-1);
     }
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -907,7 +1273,7 @@ static void sao_get_neighbor_avail(xavs2_t *h, sao_region_t *p_avail, int i_lcu_
     int width_c = width >> 1;
     int height_c = height >> 1;
 
-    /* ¿ÉÓÃÐÔ»ñÈ¡ */
+    /* determine which neighbors are available */
     p_avail->b_left = i_lcu_x != 0;
     p_avail->b_top  = i_lcu_y != 0;
     p_avail->b_right = (i_lcu_x < h->i_width_in_lcu - 1);
@@ -928,7 +1294,7 @@ static void sao_get_neighbor_avail(xavs2_t *h, sao_region_t *p_avail, int i_lcu_
     p_avail->b_down_left = p_avail->b_down && p_avail->b_left;
     p_avail->b_right_down = p_avail->b_down && p_avail->b_right;
 
-    /* ÂË²¨ÇøÓòµÄµ÷Õû */
+    /* adjust the filtering region */
     if (!p_avail->b_right) {
         width += SAO_SHIFT_PIX_NUM;
         width_c += SAO_SHIFT_PIX_NUM;
@@ -1098,16 +1464,46 @@ void sao_copy_lcu(xavs2_t *h, xavs2_frame_t *frm_dst, xavs2_frame_t *frm_src, in
     int lcu_height;
     int i_first_lcu_y_for_filter = h->param->b_cross_slice_loop_filter ? 0 : h->slices[h->i_slice_index]->i_first_lcu_y;
     int start_y_shift = (lcu_y != i_first_lcu_y_for_filter) ? SAO_SHIFT_PIX_NUM : 0;
-    pel_t *p_src;
-    pel_t *p_dst;
-    pel_t *p_src2, *p_dst2;
+    if (h->param->input_sample_bit_depth == 8) {
+    pel8_t *p_src;
+    pel8_t *p_dst;
+    pel8_t *p_src2, *p_dst2;
+
+    /* luma component */
+    start_y -= start_y_shift;
+    lcu_height = end_y - start_y;
+    p_src = frm_src->planes8[0] + start_y * i_src + start_x;
+    p_dst = frm_dst->planes8[0] + start_y * i_dst + start_x;
+    g_funcs.plane_copy8(h, p_dst, i_dst, p_src, i_src, lcu_width, lcu_height);
+
+    /* chroma component */
+    start_y = lcu_y << (h->i_lcu_level - CHROMA_V_SHIFT);
+    start_y -= start_y_shift;
+    end_y   >>= CHROMA_V_SHIFT;
+    start_x >>= CHROMA_V_SHIFT;
+    end_x   >>= CHROMA_V_SHIFT;
+
+    lcu_width  = end_x - start_x;
+    lcu_height = end_y - start_y;
+    i_src = frm_src->i_stride[1];
+    i_dst = frm_dst->i_stride[1];
+    p_src  = frm_src->planes8[1] + start_y * i_src + start_x;
+    p_src2 = frm_src->planes8[2] + start_y * i_src + start_x;
+    p_dst  = frm_dst->planes8[1] + start_y * i_dst + start_x;
+    p_dst2 = frm_dst->planes8[2] + start_y * i_dst + start_x;
+    g_funcs.plane_copy8(h, p_dst, i_dst, p_src, i_src, lcu_width, lcu_height);
+    g_funcs.plane_copy8(h, p_dst2, i_dst, p_src2, i_src, lcu_width, lcu_height);
+    } else {
+    pel10_t *p_src;
+    pel10_t *p_dst;
+    pel10_t *p_src2, *p_dst2;
 
     /* luma component */
     start_y -= start_y_shift;
     lcu_height = end_y - start_y;
-    p_src = frm_src->planes[0] + start_y * i_src + start_x;
-    p_dst = frm_dst->planes[0] + start_y * i_dst + start_x;
-    g_funcs.plane_copy(p_dst, i_dst, p_src, i_src, lcu_width, lcu_height);
+    p_src = frm_src->planes10[0] + start_y * i_src + start_x;
+    p_dst = frm_dst->planes10[0] + start_y * i_dst + start_x;
+    g_funcs.plane_copy10(h, p_dst, i_dst, p_src, i_src, lcu_width, lcu_height);
 
     /* chroma component */
     start_y = lcu_y << (h->i_lcu_level - CHROMA_V_SHIFT);
@@ -1120,12 +1516,13 @@ void sao_copy_lcu(xavs2_t *h, xavs2_frame_t *frm_dst, xavs2_frame_t *frm_src, in
     lcu_height = end_y - start_y;
     i_src = frm_src->i_stride[1];
     i_dst = frm_dst->i_stride[1];
-    p_src  = frm_src->planes[1] + start_y * i_src + start_x;
-    p_src2 = frm_src->planes[2] + start_y * i_src + start_x;
-    p_dst  = frm_dst->planes[1] + start_y * i_dst + start_x;
-    p_dst2 = frm_dst->planes[2] + start_y * i_dst + start_x;
-    g_funcs.plane_copy(p_dst, i_dst, p_src, i_src, lcu_width, lcu_height);
-    g_funcs.plane_copy(p_dst2, i_dst, p_src2, i_src, lcu_width, lcu_height);
+    p_src  = frm_src->planes10[1] + start_y * i_src + start_x;
+    p_src2 = frm_src->planes10[2] + start_y * i_src + start_x;
+    p_dst  = frm_dst->planes10[1] + start_y * i_dst + start_x;
+    p_dst2 = frm_dst->planes10[2] + start_y * i_dst + start_x;
+    g_funcs.plane_copy10(h, p_dst, i_dst, p_src, i_src, lcu_width, lcu_height);
+    g_funcs.plane_copy10(h, p_dst2, i_dst, p_src2, i_src, lcu_width, lcu_height);
+    }
 }
 
 /* ---------------------------------------------------------------------------
@@ -1144,7 +1541,7 @@ void sao_get_lcu_param_after_deblock(xavs2_t *h, aec_t *p_aec, int i_lcu_x, int
             for (type = 0; type < 5; type++) {
                 if (!h->param->b_fast_sao || tab_sao_check_mode_fast[compIdx][type]) {
                     if (((!IS_ALG_ENABLE(OPT_FAST_SAO)) || (!(!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B)))) {
-                        gf_sao_stat[type](h->img_sao, h->fenc, &h->sao_stat_datas[i_lcu_xy][compIdx][type], &region, compIdx);
+                        gf_sao_stat[type](h, h->img_sao, h->fenc, &h->sao_stat_datas[i_lcu_xy][compIdx][type], &region, compIdx);
                     }
                     // SAOStatData tmp;
                     // memset(&tmp, 0, sizeof(tmp));
@@ -1182,8 +1579,9 @@ void sao_filter_lcu(xavs2_t *h, SAOBlkParam blk_param[NUM_SAO_COMPONENTS], int l
         int pix_x = region.pix_x[compIdx];
         int i_dst = h->fdec->i_stride[compIdx];
         int i_src = h->img_sao->i_stride[compIdx];
-        pel_t *dst = h->fdec->planes[compIdx]    + pix_y * i_dst + pix_x;
-        pel_t *src = h->img_sao->planes[compIdx] + pix_y * i_src + pix_x;
+        if (h->param->input_sample_bit_depth == 8) {
+        pel8_t *dst = h->fdec->planes8[compIdx]    + pix_y * i_dst + pix_x;
+        pel8_t *src = h->img_sao->planes8[compIdx] + pix_y * i_src + pix_x;
         int avail[8];
         avail[0] = region.b_top;
         avail[1] = region.b_down;
@@ -1193,11 +1591,26 @@ void sao_filter_lcu(xavs2_t *h, SAOBlkParam blk_param[NUM_SAO_COMPONENTS], int l
         avail[5] = region.b_top_right;
         avail[6] = region.b_down_left;
         avail[7] = region.b_right_down;
-        g_funcs.sao_block(dst, i_dst, src, i_src,
+        g_funcs.sao_block8(h, dst, i_dst, src, i_src,
                           region.width[compIdx], region.height[compIdx],
                           avail, &p_param[compIdx]);
-
-    }
+        } else {
+        pel10_t *dst = h->fdec->planes10[compIdx]    + pix_y * i_dst + pix_x;
+        pel10_t *src = h->img_sao->planes10[compIdx] + pix_y * i_src + pix_x;
+       int avail[8];
+        avail[0] = region.b_top;
+        avail[1] = region.b_down;
+        avail[2] = region.b_left;
+        avail[3] = region.b_right;
+        avail[4] = region.b_top_left;
+        avail[5] = region.b_top_right;
+        avail[6] = region.b_down_left;
+        avail[7] = region.b_right_down;
+        g_funcs.sao_block10(h, dst, i_dst, src, i_src,
+                          region.width[compIdx], region.height[compIdx],
+                          avail, &p_param[compIdx]);
+        }
+     }
 }
 
 
diff --git a/source/encoder/slice.c b/source/encoder/slice.c
index 5ab94ce..d06da65 100644
--- a/source/encoder/slice.c
+++ b/source/encoder/slice.c
@@ -65,7 +65,7 @@ extern int g_bit_count;         /* global bit    count for trace */
 
 
 /* ---------------------------------------------------------------------------
- * ³õÊ¼»¯LCUÐÐµÄ±àÂëË³Ðò
+ * initialize the encoding order of the LCU rows
  */
 void slice_lcu_row_order_init(xavs2_t *h)
 {
@@ -119,7 +119,7 @@ void slice_lcu_row_order_init(xavs2_t *h)
                 p_slice = h->slices[idx_slice];
             }
         }
-    }   // Ä¬ÈÏÐÐ¼¶Ë³Ðò
+    }   // default row-level order
 }
 
 /* ---------------------------------------------------------------------------
@@ -388,7 +388,11 @@ void *xavs2_lcu_row_write(void *arg)
 
     h->lcu.get_skip_mvs = g_funcs.get_skip_mv_predictors[h->i_type];
     if (h->param->slice_num > 1) {
-        slice_init_bufer(h, slice);
+        if (h->param->input_sample_bit_depth == 8) {
+        slice_init_bufer8(h, slice);
+        } else {
+        slice_init_bufer10(h, slice);
+        }
     }
 
     /* loop over all LCUs in current lcu row ------------------------
@@ -588,7 +592,11 @@ void xavs2_slice_write_start(xavs2_t *h)
     aec_start(h, p_aec, slice->bs.p_start + PSEUDO_CODE_SIZE, slice->bs.p_end, 0);
 
     /* init slice buffers */
-    slice_init_bufer(h, slice);
+    if (h->param->input_sample_bit_depth == 8) {
+    slice_init_bufer8(h, slice);
+    } else {
+    slice_init_bufer10(h, slice);
+    }
 
     /* prediction mode is set to -1 outside the frame,
      * indicating that no prediction can be made from this part */
diff --git a/source/encoder/slice.h b/source/encoder/slice.h
index 5e7e2ba..af4d5ea 100644
--- a/source/encoder/slice.h
+++ b/source/encoder/slice.h
@@ -44,31 +44,42 @@
  * ===========================================================================
  */
 typedef struct slice_row_index_t {
-    int16_t lcu_y;       /* ÐÐ±àºÅ */
-    int8_t  slice_idx;   /* ÐÐËùÔÚµÄSliceË÷ÒýºÅ */
-    int8_t  row_type;    /* 0: Slice¿ªÊ¼Î»ÖÃµÄÐÐ£»1:ÆÕÍ¨£»2: Slice½áÊøÎ»ÖÃµÄÐÐ */
+    int16_t lcu_y;       /* LCU row index */
+    int8_t  slice_idx;   /* index of the slice this row belongs to */
+    int8_t  row_type;    /* 0: row at the start of a slice; 1: ordinary row; 2: row at the end of a slice */
 } slice_row_index_t;
 
 extern slice_row_index_t g_slice_lcu_row_order[1024];
 
 /* ---------------------------------------------------------------------------
- * ³õÊ¼»¯Slice¼¶µÄbufferÖ¸Õë
+ * initialize the slice-level buffer pointers
  */
 static ALWAYS_INLINE
-void slice_init_bufer(xavs2_t *h, slice_t *slice)
+void slice_init_bufer8(xavs2_t *h, slice_t *slice)
 {
     /* init slice buffers */
     h->ipredmode         = slice->slice_ipredmode;
-    h->intra_border[0]   = slice->slice_intra_border[0];
-    h->intra_border[1]   = slice->slice_intra_border[1];
-    h->intra_border[2]   = slice->slice_intra_border[2];
+    h->intra_border8[0]   = slice->slice_intra_border8[0];
+    h->intra_border8[1]   = slice->slice_intra_border8[1];
+    h->intra_border8[2]   = slice->slice_intra_border8[2];
     h->p_deblock_flag[0] = slice->slice_deblock_flag[0];
     h->p_deblock_flag[1] = slice->slice_deblock_flag[1];
 }
 
+static ALWAYS_INLINE
+void slice_init_bufer10(xavs2_t *h, slice_t *slice)
+{
+    /* point h's working pointers (ipredmode, intra_border10[0..2], p_deblock_flag[0..1]) at the buffers held by this slice */
+    h->ipredmode         = slice->slice_ipredmode;
+    h->intra_border10[0]   = slice->slice_intra_border10[0];
+    h->intra_border10[1]   = slice->slice_intra_border10[1];
+    h->intra_border10[2]   = slice->slice_intra_border10[2];
+    h->p_deblock_flag[0] = slice->slice_deblock_flag[0];
+    h->p_deblock_flag[1] = slice->slice_deblock_flag[1];
+}
 
 /* ---------------------------------------------------------------------------
- * µÈ´ýÒ»ÐÐLCU±àÂëÍêÖ¸¶¨ÊýÁ¿µÄLCU
+ * wait until the given number of LCUs in one LCU row have been encoded
  */
 static ALWAYS_INLINE
 void wait_lcu_row_coded(row_info_t *last_row, int wait_lcu_coded)
@@ -84,7 +95,7 @@ void wait_lcu_row_coded(row_info_t *last_row, int wait_lcu_coded)
 
 
 /* ---------------------------------------------------------------------------
- * ²éÑ¯Ò»ÐÐLCUÊÇ·ñÒÑ±àÂëÍê±Ï
+ * query whether one LCU row has been completely encoded
  */
 static ALWAYS_INLINE
 int is_lcu_row_finished(xavs2_t *h, xavs2_frame_t *frm, int lcu_row)
@@ -93,7 +104,7 @@ int is_lcu_row_finished(xavs2_t *h, xavs2_frame_t *frm, int lcu_row)
 }
 
 /* ---------------------------------------------------------------------------
- * ²éÑ¯Ò»ÐÐLCUÊÇ·ñÒÑ±àÂëÍê±Ï
+ * mark one LCU row as completely encoded
  */
 static ALWAYS_INLINE
 void set_lcu_row_finished(xavs2_t *h, xavs2_frame_t *frm, int lcu_row)
@@ -114,8 +125,8 @@ void xavs2e_release_row_task(row_info_t *row)
         xavs2_handler_t *h_mgr = h->h_top;
         int b_slice_boundary_done = FALSE;
 
-        /* Èç¹û´ËÊ±Slice±ß½çµÄÏàÁÚÐÐÒÑ´¦ÀíÍê£¬ÔòÖ±½Ó½øÐÐ²åÖµ£¬²»ÐèÒª¼ÓËø
-         * ·ñÔò£¬ÐèÒª¼ÓËøºó½øÐÐ´¦Àí£¬±ÜÃâ³öÏÖÎÊÌâ */
+        /* if the adjacent row at the slice boundary has already been processed, interpolate directly without locking;
+         * otherwise take the lock before processing to avoid problems */
         if (h->param->b_cross_slice_loop_filter == FALSE) {
             if (row->b_top_slice_border && row->row > 0) {
                 if (is_lcu_row_finished(h, fdec, row->row - 1)) {
@@ -131,7 +142,7 @@ void xavs2e_release_row_task(row_info_t *row)
                 }
             }
         } else {
-            /* TODO: ¶àSlice²¢ÐÐÊ±£¬¶ÔSlice±ß½çµÄ´¦Àí */
+            /* TODO: handle slice boundaries when multiple slices are processed in parallel */
             if (h->param->slice_num > 1) {
                 xavs2_log(NULL, XAVS2_LOG_ERROR, "CrossSliceLoopFilter not supported now!\n");
                 assert(0);
@@ -156,7 +167,7 @@ void xavs2e_release_row_task(row_info_t *row)
                 }
             }
         } else {
-            /* TODO: ¶àSlice²¢ÐÐÊ±£¬¶ÔSlice±ß½çµÄ´¦Àí */
+            /* TODO: handle slice boundaries when multiple slices are processed in parallel */
         }
         set_lcu_row_finished(h, fdec, row->row);
         xavs2_thread_mutex_unlock(&fdec->mutex);         /* unlock */
@@ -229,7 +240,7 @@ xavs2_t *xavs2e_alloc_row_task(xavs2_t *h)
                 memcpy(&h_row_coder->row_vars_1, &h->row_vars_1, (uint8_t *)&h->row_vars_2 - (uint8_t *)&h->row_vars_1);
 
                 /* make the state of the aec engine same as the one when the slice starts */
-                /* ÕâÀïh->aecµÄÎ»ÖÃ²»Í¬µ¼ÖÂÐÔÄÜ²»Ò»Ñù£¬µ«ÊÇÔÚLCUÐÐ±àÂëÊ±ÖØÐÂ×öÁËÍ¬²½±£Ö¤ÁËÒ»ÖÂÐÔ */
+                /* the position of h->aec differs here, which changes performance, but the state is re-synchronized when an LCU row is encoded, which keeps results consistent */
                 aec_copy_aec_state(&h_row_coder->aec, &h->aec);
                 /* unlock */
                 xavs2_thread_mutex_unlock(&h_mgr->mutex);
diff --git a/source/encoder/tdrdo.c b/source/encoder/tdrdo.c
index 0acf2f0..68b4b1b 100644
--- a/source/encoder/tdrdo.c
+++ b/source/encoder/tdrdo.c
@@ -54,7 +54,8 @@ typedef struct Frame {
     uint32_t    FrameWidth;
     uint32_t    FrameHeight;
     uint32_t    nStrideY;
-    pel_t      *Y_base;
+    pel10_t      *Y_base10;
+    pel8_t    *Y_base;
 } Frame;
 
 typedef struct BlockDistortion {
@@ -163,15 +164,37 @@ static DL *CreatDistortionList(DL *NewDL, uint32_t totalframenumber, uint32_t wi
 
 /* ---------------------------------------------------------------------------
  */
-static double CalculateBlockMSE(Frame *FA, Frame *FB, Block *A, Block *B)
+static double CalculateBlockMSE8(Frame *FA, Frame *FB, Block *A, Block *B)
 {
     uint16_t x, y;
     int e, blockpixel = A->BlockHeight * A->BlockWidth;
-    pel_t *YA, *YB;
     double dSSE = 0;
 
+    pel8_t *YA, *YB;
     YA = FA->Y_base + A->OriginY * FA->nStrideY + A->OriginX;
     YB = FB->Y_base + B->OriginY * FB->nStrideY + B->OriginX;
+
+    for (y = 0; y < A->BlockHeight; y++) {
+        for (x = 0; x < A->BlockWidth; x++) {
+            e = YA[x] - YB[x];
+            dSSE += e * e;
+        }
+        YA = YA + FA->nStrideY;
+        YB = YB + FB->nStrideY;
+    }
+    return dSSE / blockpixel;
+}
+
+static double CalculateBlockMSE10(Frame *FA, Frame *FB, Block *A, Block *B)
+{
+    uint16_t x, y;
+    int e, blockpixel = A->BlockHeight * A->BlockWidth;
+    double dSSE = 0;
+
+    pel10_t *YA, *YB;
+    YA = FA->Y_base10 + A->OriginY * FA->nStrideY + A->OriginX;
+    YB = FB->Y_base10 + B->OriginY * FB->nStrideY + B->OriginX;
+
     for (y = 0; y < A->BlockHeight; y++) {
         for (x = 0; x < A->BlockWidth; x++) {
             e = YA[x] - YB[x];
@@ -185,7 +208,7 @@ static double CalculateBlockMSE(Frame *FA, Frame *FB, Block *A, Block *B)
 
 /* ---------------------------------------------------------------------------
  */
-static void MotionDistortion(FD *currentFD, Frame *FA, Frame *FB, uint32_t searchrange)
+static void MotionDistortion(xavs2_t *h, FD *currentFD, Frame *FA, Frame *FB, uint32_t searchrange)
 {
     static int dlx[9] = {0, -2, -1,  0,  1, 2, 1, 0, -1};
     static int dly[9] = {0,  0, -1, -2, -1, 0, 1, 2,  1};
@@ -267,13 +290,23 @@ static void MotionDistortion(FD *currentFD, Frame *FA, Frame *FB, uint32_t searc
                     if (x >= left && x <= right && y >= top && y <= bottom) {
                         pBB->OriginX = x;
                         pBB->OriginY = y;
-                        currentMSE = CalculateBlockMSE(FA, FB, pBA, pBB);
+                        if (h->param->input_sample_bit_depth == 8) {
+                        currentMSE = CalculateBlockMSE8(FA, FB, pBA, pBB);
                         if (currentMSE < candidateMSE) {
                             candidateMSE = currentMSE;
                             currentBD->MSE = currentMSE;
                             nextcx = x;
                             nextcy = y;
                         }
+                        } else {
+                        currentMSE = CalculateBlockMSE10(FA, FB, pBA, pBB);
+                        if (currentMSE < candidateMSE) {
+                            candidateMSE = currentMSE;
+                            currentBD->MSE = currentMSE;
+                            nextcx = x;
+                            nextcy = y;
+                        }
+                        }
                     }
                 }
                 if (cy == nextcy && cx == nextcx) {
@@ -621,20 +654,37 @@ void tdrdo_frame_start(xavs2_t *h)
     }
     td_rdo->pRealFD->BlockDistortionArray = (BD *)xavs2_calloc(td_rdo->pRealFD->TotalNumOfBlocks, sizeof(BD));
     if (td_rdo->GlobeFrameNumber % td_rdo->StepLength == 0) {
+        if (h->param->input_sample_bit_depth == 8) {
         if (h->fenc->i_frame == 0) {
-            td_rdo->porgF.Y_base   = h->fenc->planes[IMG_Y];
+            td_rdo->porgF.Y_base   = h->fenc->planes8[IMG_Y];
             td_rdo->porgF.nStrideY = h->fenc->i_stride[IMG_Y];
-            td_rdo->ppreF.Y_base   = h->img_luma_pre->planes[IMG_Y];
+            td_rdo->ppreF.Y_base   = h->img_luma_pre->planes8[IMG_Y];
             td_rdo->ppreF.nStrideY = h->img_luma_pre->i_stride[IMG_Y];
             xavs2_frame_copy_planes(h, h->img_luma_pre, h->fenc);
         } else  if ((int)h->fenc->i_frame < h->param->num_frames) {
             td_rdo->pOMCPFD = &td_rdo->OMCPDList.FrameDistortionArray[td_rdo->GlobeFrameNumber - 1];
             td_rdo->pOMCPFD->BlockDistortionArray = (BD *)xavs2_calloc(td_rdo->pOMCPFD->TotalNumOfBlocks, sizeof(BD));
-            td_rdo->porgF.Y_base = h->fenc->planes[IMG_Y];
+            td_rdo->porgF.Y_base = h->fenc->planes8[IMG_Y];
             td_rdo->porgF.nStrideY = h->fenc->i_stride[IMG_Y];
-            MotionDistortion(td_rdo->pOMCPFD, &td_rdo->ppreF, &td_rdo->porgF, SEARCHRANGE);
+            MotionDistortion(h, td_rdo->pOMCPFD, &td_rdo->ppreF, &td_rdo->porgF, SEARCHRANGE);
             xavs2_frame_copy_planes(h, h->img_luma_pre, h->fenc);
         }
+        } else {
+        if (h->fenc->i_frame == 0) {
+            td_rdo->porgF.Y_base10   = h->fenc->planes10[IMG_Y];
+            td_rdo->porgF.nStrideY = h->fenc->i_stride[IMG_Y];
+            td_rdo->ppreF.Y_base10   = h->img_luma_pre->planes10[IMG_Y];
+            td_rdo->ppreF.nStrideY = h->img_luma_pre->i_stride[IMG_Y];
+            xavs2_frame_copy_planes(h, h->img_luma_pre, h->fenc);
+        } else  if ((int)h->fenc->i_frame < h->param->num_frames) {
+            td_rdo->pOMCPFD = &td_rdo->OMCPDList.FrameDistortionArray[td_rdo->GlobeFrameNumber - 1];
+            td_rdo->pOMCPFD->BlockDistortionArray = (BD *)xavs2_calloc(td_rdo->pOMCPFD->TotalNumOfBlocks, sizeof(BD));
+            td_rdo->porgF.Y_base10 = h->fenc->planes10[IMG_Y];
+            td_rdo->porgF.nStrideY = h->fenc->i_stride[IMG_Y];
+            MotionDistortion(h, td_rdo->pOMCPFD, &td_rdo->ppreF, &td_rdo->porgF, SEARCHRANGE);
+            xavs2_frame_copy_planes(h, h->img_luma_pre, h->fenc);
+        }
+        }
         td_rdo->pOMCPFD = NULL;
     }
 
@@ -653,10 +703,14 @@ void tdrdo_frame_done(xavs2_t *h)
     assert(td_rdo != NULL);
 
     if ((h->fenc->i_frame % td_rdo->StepLength == 0 && !h->param->num_bframes) || h->param->num_bframes) {
-        td_rdo->precF.Y_base = h->fdec->planes[IMG_Y];
+        if (h->param->input_sample_bit_depth == 8) {
+        td_rdo->precF.Y_base = h->fdec->planes8[IMG_Y];
+        } else {
+        td_rdo->precF.Y_base10 = h->fdec->planes10[IMG_Y];
+        }
         //td_rdo->precF.nStrideY = h->fdec->i_stride[IMG_Y];// fdec->stride[0] , bitrate rise ?
         td_rdo->precF.nStrideY = h->img_luma_pre->i_stride[IMG_Y];   //to check: fdec->stride[0] ? by lutao
-        MotionDistortion(td_rdo->pRealFD, &td_rdo->porgF, &td_rdo->precF, 0);
+        MotionDistortion(h, td_rdo->pRealFD, &td_rdo->porgF, &td_rdo->precF, 0);
     }
     td_rdo->pRealFD->FrameNumber = h->fenc->i_frame;
     td_rdo->globenumber++;
@@ -706,7 +760,7 @@ void tdrdo_lcu_adjust_lambda(xavs2_t *h, rdcost_t *new_lambda)
     // Just for LDP
     if (h->i_type != SLICE_TYPE_I && h->param->num_bframes == 0) {
         AdjustLcuQPLambdaLDP(h, td_rdo->pOMCPFD, h->lcu.i_scu_xy, h->i_width_in_mincu, new_lambda);
-        td_rdo->CurMBQP = XAVS2_CLIP3F(MIN_QP, MAX_QP, td_rdo->CurMBQP);
+        td_rdo->CurMBQP = XAVS2_CLIP3F(MIN_QP, MAX_QP + (h->param->sample_bit_depth - 8) * 8, td_rdo->CurMBQP);
     }
 }
 
diff --git a/source/encoder/wrapper.h b/source/encoder/wrapper.h
index 13c3f7e..4f75d51 100644
--- a/source/encoder/wrapper.h
+++ b/source/encoder/wrapper.h
@@ -48,7 +48,8 @@
  */
 
 // function type
-typedef void(*vpp_ipred_t)(pel_t *p_pred, pel_t *p_top, pel_t *p_left);
+typedef void(*vpp_ipred8_t)(pel8_t *p_pred, pel8_t *p_top, pel8_t *p_left);
+typedef void(*vpp_ipred10_t)(pel10_t *p_pred, pel10_t *p_top, pel10_t *p_left);
 
 /* ---------------------------------------------------------------------------
  * lookahead_t
@@ -63,26 +64,44 @@ typedef struct lookahead_t {
 /* ---------------------------------------------------------------------------
  * low resolution of frame (luma plane)
  */
-typedef struct frm_lowres_t {
+typedef struct frm_lowres8_t {
     int         i_width;              /* width  for luma plane */
     int         i_lines;              /* height for luma plane */
     int         i_stride;             /* stride for luma plane */
-    pel_t      *filtered;             /* half-size copy of input frame (luma only) */
-} frm_lowres_t;
+    pel8_t      *filtered8;             /* half-size copy of input frame (luma only) */
+} frm_lowres8_t;
+
+typedef struct frm_lowres10_t {
+    int         i_width;              /* width  for luma plane */
+    int         i_lines;              /* height for luma plane */
+    int         i_stride;             /* stride for luma plane */
+    pel10_t      *filtered10;             /* half-size copy of input frame (luma only) */
+} frm_lowres10_t;
 
 /* ---------------------------------------------------------------------------
  * video pre-processing motion estimation
  */
-typedef struct vpp_me_t {
+typedef struct vpp8_me_t {
+    int             mv_min[2];        /* full pel MV range for motion search (min) */
+    int             mv_max[2];        /* full pel MV range for motion search (max) */
+    mv_t            bmv;              /* [OUT] best motion vector */
+    mv_t            pmv;              /* pred motion vector for the current block */
+    uint16_t       *mvbits;           /* used for getting the mv bits */
+    pixel8_cmp_t     sad8_8x8;          /* function handle for cal sad of 8x8 block */
+    pixel8_cmp_x3_t  sad8_8x8_x3;       /* function handle for cal sad of 8x8 block (X3) */
+    pixel8_cmp_x4_t  sad8_8x8_x4;       /* function handle for cal sad of 8x8 block (X4) */
+} vpp8_me_t;
+
+typedef struct vpp10_me_t {
     int             mv_min[2];        /* full pel MV range for motion search (min) */
     int             mv_max[2];        /* full pel MV range for motion search (max) */
     mv_t            bmv;              /* [OUT] best motion vector */
     mv_t            pmv;              /* pred motion vector for the current block */
     uint16_t       *mvbits;           /* used for getting the mv bits */
-    pixel_cmp_t     sad_8x8;          /* function handle for cal sad of 8x8 block */
-    pixel_cmp_x3_t  sad_8x8_x3;       /* function handle for cal sad of 8x8 block (X3) */
-    pixel_cmp_x4_t  sad_8x8_x4;       /* function handle for cal sad of 8x8 block (X4) */
-} vpp_me_t;
+    pixel10_cmp_t     sad10_8x8;          /* function handle for cal sad of 8x8 block */
+    pixel10_cmp_x3_t  sad10_8x8_x3;       /* function handle for cal sad of 8x8 block (X3) */
+    pixel10_cmp_x4_t  sad10_8x8_x4;       /* function handle for cal sad of 8x8 block (X4) */
+} vpp10_me_t;
 
 /* ---------------------------------------------------------------------------
  * frame buffer manager
diff --git a/source/encoder/xavs2.c b/source/encoder/xavs2.c
index b2e6d54..631b215 100644
--- a/source/encoder/xavs2.c
+++ b/source/encoder/xavs2.c
@@ -161,7 +161,7 @@ xavs2_param_t *xavs2_encoder_opt_alloc(void)
     param->enable_alf                 = TRUE;
     param->alf_LowLatencyEncoding     = FALSE;
     param->enable_pmvr                = TRUE;
-    param->b_cross_slice_loop_filter  = FALSE;    // Ó°ÏìÖ¡¼¶²¢ÐÐ±à½âÂëµÄËÙ¶È£¬Ä¬ÈÏ½ûÓÃ
+    param->b_cross_slice_loop_filter  = FALSE;    // affects the speed of frame-level parallel encoding/decoding; disabled by default
     param->enable_dmh                 = TRUE;
     param->b_fast_2lelvel_tu          = FALSE;
 
@@ -280,7 +280,7 @@ void *xavs2_encoder_create(xavs2_param_t *param)
     /* check parameters */
     if (encoder_check_parameters(param) < 0) {
         xavs2_log(NULL, XAVS2_LOG_ERROR, "error encoder parameters\n");
-        goto fail;
+        goto fail8;
     }
 
     size_ratecontrol = xavs2_rc_get_buffer_size(param);      /* rate control */
@@ -294,7 +294,7 @@ void *xavs2_encoder_create(xavs2_param_t *param)
                CACHE_LINE_SIZE * (XAVS2_INPUT_NUM + 4);
 
     /* alloc memory for the encoder wrapper */
-    CHECKED_MALLOC(mem_ptr, uint8_t *, mem_size);
+    CHECKED_MALLOC8(mem_ptr, uint8_t *, mem_size);
 
     /* M0: assign the wrapper */
     h_mgr = (xavs2_handler_t *)mem_ptr;
@@ -333,12 +333,12 @@ void *xavs2_encoder_create(xavs2_param_t *param)
 #endif
 
     if (xavs2_thread_mutex_init(&h_mgr->mutex, NULL)) {
-        goto fail;
+        goto fail8;
     }
 
     for (i = 0; i < SIG_COUNT; i++) {
         if (xavs2_thread_cond_init(&h_mgr->cond[i], NULL)) {
-            goto fail;
+            goto fail8;
         }
     }
 
@@ -359,7 +359,7 @@ void *xavs2_encoder_create(xavs2_param_t *param)
         /* create the thread pool */
         if (xavs2_threadpool_init(&h_mgr->threadpool_rdo, thread_num, NULL, NULL)) {
             xavs2_log(h_mgr, XAVS2_LOG_ERROR, "Error init thread pool RDO. %d", thread_num);
-            goto fail;
+            goto fail8;
         }
         h_mgr->num_pool_threads = thread_num;
     }
@@ -374,7 +374,7 @@ void *xavs2_encoder_create(xavs2_param_t *param)
     if (xl_init(&h_mgr->list_frames_free)  != 0 ||
         xl_init(&h_mgr->list_frames_output) != 0 ||
         xl_init(&h_mgr->list_frames_ready) != 0) {
-        goto fail;
+        goto fail8;
     }
 
     /* init rate-control buffer */
@@ -385,7 +385,7 @@ void *xavs2_encoder_create(xavs2_param_t *param)
 
     if (xavs2_rc_init(h_mgr->rate_control, param) < 0) {
         xavs2_log(h_mgr, XAVS2_LOG_ERROR, "create rate control fail\n");
-        goto fail;
+        goto fail8;
 
     }
 
@@ -397,20 +397,20 @@ void *xavs2_encoder_create(xavs2_param_t *param)
 
         if (tdrdo_init(h_mgr->td_rdo, param) != 0) {
             xavs2_log(h_mgr, XAVS2_LOG_ERROR, "init td-rdo fail\n");
-            goto fail;
+            goto fail8;
         }
     }
 
     /* create an encoder handler */
     h_mgr->p_coder = encoder_open(param, h_mgr);
     if (h_mgr->p_coder == NULL) {
-        goto fail;
+        goto fail8;
     }
 
     /* create encoder handlers for multi-thread */
     if (h_mgr->i_frm_threads > 1 || h_mgr->i_row_threads > 1) {
         if (encoder_contexts_init(h_mgr->p_coder, h_mgr) < 0) {
-            goto fail;
+            goto fail8;
         }
     }
 
@@ -422,7 +422,7 @@ void *xavs2_encoder_create(xavs2_param_t *param)
         if (frm) {
             xl_append(&h_mgr->list_frames_free, frm);
         } else {
-            goto fail;
+            goto fail8;
         }
     }
 
@@ -433,7 +433,7 @@ void *xavs2_encoder_create(xavs2_param_t *param)
     /* memory check */
     if ((uintptr_t)(h_mgr) + mem_size < (uintptr_t)mem_ptr) {
         xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to create input frame buffer.\n");
-        goto fail;
+        goto fail8;
     }
 
     /* init lookahead in the encoder wrapper */
@@ -448,12 +448,12 @@ void *xavs2_encoder_create(xavs2_param_t *param)
     /* create wrapper thread */
     if (xavs2_create_thread(&h_mgr->thread_wrapper, proc_wrapper_thread, h_mgr)) {
         xavs2_log(h_mgr, XAVS2_LOG_ERROR, "create encoding thread\n");
-        goto fail;
+        goto fail8;
     }
 
     return h_mgr;
 
-fail:
+fail8:
     if (mem_ptr && h_mgr) {
         xavs2_encoder_destroy(h_mgr);
     }
@@ -532,7 +532,7 @@ int xavs2_encoder_get_buffer(void *coder, xavs2_picture_t *pic)
 
     /* set properties */
     pic->img.in_sample_size  = param->input_sample_bit_depth == 8 ? 1 : 2;
-    pic->img.enc_sample_size = sizeof(pel_t);
+    pic->img.enc_sample_size = param->input_sample_bit_depth == 8 ? sizeof(pel8_t) : sizeof(pel10_t);
     pic->img.i_width[0]      = param->org_width;
     pic->img.i_width[1]      = param->org_width >> 1;
     pic->img.i_width[2]      = param->org_width >> 1;
@@ -541,12 +541,12 @@ int xavs2_encoder_get_buffer(void *coder, xavs2_picture_t *pic)
     pic->img.i_lines[2]      = param->org_height >> (param->chroma_format <= CHROMA_420 ? 1 : 0);
     pic->img.i_csp           = XAVS2_CSP_I420;
     pic->img.i_plane         = frame->i_plane;
-    pic->img.i_stride[0]     = frame->i_stride[0] * sizeof(pel_t);
-    pic->img.i_stride[1]     = frame->i_stride[1] * sizeof(pel_t);
-    pic->img.i_stride[2]     = frame->i_stride[2] * sizeof(pel_t);
-    pic->img.img_planes[0]   = (uint8_t *)frame->planes[0];
-    pic->img.img_planes[1]   = (uint8_t *)frame->planes[1];
-    pic->img.img_planes[2]   = (uint8_t *)frame->planes[2];
+    pic->img.i_stride[0]     = param->input_sample_bit_depth == 8 ? frame->i_stride[0] * sizeof(pel8_t) : frame->i_stride[0] * sizeof(pel10_t);   /* stride in bytes: element stride times sample size */
+    pic->img.i_stride[1]     = param->input_sample_bit_depth == 8 ? frame->i_stride[1] * sizeof(pel8_t) : frame->i_stride[1] * sizeof(pel10_t);   /* each plane uses its own stride in both arms */
+    pic->img.i_stride[2]     = param->input_sample_bit_depth == 8 ? frame->i_stride[2] * sizeof(pel8_t) : frame->i_stride[2] * sizeof(pel10_t);
+    pic->img.img_planes[0]   = param->input_sample_bit_depth == 8 ? (uint8_t *)frame->planes8[0] : (uint8_t *)frame->planes10[0];   /* plane buffers exposed as byte pointers */
+    pic->img.img_planes[1]   = param->input_sample_bit_depth == 8 ? (uint8_t *)frame->planes8[1] : (uint8_t *)frame->planes10[1];   /* plane i maps to frame plane i in both arms */
+    pic->img.img_planes[2]   = param->input_sample_bit_depth == 8 ? (uint8_t *)frame->planes8[2] : (uint8_t *)frame->planes10[2];
     pic->priv                = frame;   /* keep trace of this frame */
 
     return 0;
diff --git a/source/encoder/xavs2_api.c b/source/encoder/xavs2_api.c
index 43fa6b2..7bd28f7 100644
--- a/source/encoder/xavs2_api.c
+++ b/source/encoder/xavs2_api.c
@@ -57,7 +57,24 @@
 static xavs2_api_t api_default = {
     XVERSION_STR,
     VER_MAJOR * 10 + VER_MINOR,
-    BIT_DEPTH,
+    8,
+    xavs2_encoder_opt_help,
+    xavs2_encoder_opt_alloc,
+    xavs2_encoder_opt_set,
+    xavs2_encoder_opt_set2,
+    xavs2_encoder_opt_get,
+    xavs2_encoder_opt_destroy,
+    xavs2_encoder_get_buffer,
+    xavs2_encoder_create,
+    xavs2_encoder_destroy,
+    xavs2_encoder_encode,
+    xavs2_encoder_packet_unref,
+};
+
+static xavs2_api_t api_default10 = {
+    XVERSION_STR,
+    VER_MAJOR * 10 + VER_MINOR,
+    10,
     xavs2_encoder_opt_help,
     xavs2_encoder_opt_alloc,
     xavs2_encoder_opt_set,
@@ -78,7 +95,7 @@ typedef const xavs2_api_t *(*xavs2_api_get_t)(int bit_depth);
 static
 const xavs2_api_t *xavs2_load_new_module(const char *dll_path, const char *methofd_name, int bit_depth)
 {
-    /* TODO: ÔÚÊ¹ÓÃ´íÎóµÄ¿âÊ±, »á³öÏÖµÝ¹éµ÷ÓÃ´Ëº¯Êý×îÖÕµ¼ÖÂ±ÀÀ£ */
+    /* TODO: when the wrong library is used, this function ends up being called recursively, which eventually causes a crash */
 #if _WIN32
     HMODULE h = LoadLibraryA(dll_path);
     if (h) {
@@ -116,10 +133,12 @@ xavs2_api_get(int bit_depth)
     const char* method_name = "xavs2_api_get";
 
     switch (bit_depth) {
-    case BIT_DEPTH:
+    case 8:
         return &api_default;
+    case 10:
+        return &api_default10;
     default:
-        sprintf(s_lib_name, "libxavs2-%d-%dbit.%s", VER_MAJOR * 10 + VER_MINOR, bit_depth, ext_dyn_lib);
+        sprintf(s_lib_name, "libxavs2-%d-%dbit", VER_MAJOR * 10 + VER_MINOR, bit_depth);
         return xavs2_load_new_module(s_lib_name, method_name, bit_depth);
     }
 }
diff --git a/source/encoder/xlist.c b/source/encoder/xlist.c
index d26c1dc..5912f6b 100644
--- a/source/encoder/xlist.c
+++ b/source/encoder/xlist.c
@@ -37,11 +37,6 @@
 #include "common.h"
 #include "xlist.h"
 
-#if !defined(_MSC_VER)
-#include <errno.h>
-#include <pthread.h>
-#endif
-
 /**
  * ===========================================================================
  * xlist
diff --git a/source/encoder/yuv_writer.c b/source/encoder/yuv_writer.c
index 1096d8b..9d47f5b 100644
--- a/source/encoder/yuv_writer.c
+++ b/source/encoder/yuv_writer.c
@@ -46,20 +46,36 @@ void dump_yuv_out(xavs2_t *h, FILE *fp, xavs2_frame_t *frame, int img_w, int img
     int j;
 
     if (fp != NULL) {
-        UNUSED_PARAMETER(h);
+        //UNUSED_PARAMETER(h);
+        if (h->param->input_sample_bit_depth == 8) {
         for (j = 0; j < img_h; j++) {
-            fwrite(frame->planes[0] + j * frame->i_stride[0], img_w, 1, fp);
+            fwrite(frame->planes8[0] + j * frame->i_stride[0], img_w, 1, fp);
         }
 
         if (frame->i_plane == 3) {
             for (j = 0; j < (img_h >> 1); j++) {
-                fwrite(frame->planes[1] + j * frame->i_stride[1], img_w >> 1, 1, fp);
+                fwrite(frame->planes8[1] + j * frame->i_stride[1], img_w >> 1, 1, fp);
             }
 
             for (j = 0; j < (img_h >> 1); j++) {
-                fwrite(frame->planes[2] + j * frame->i_stride[2], img_w >> 1, 1, fp);
+                fwrite(frame->planes8[2] + j * frame->i_stride[2], img_w >> 1, 1, fp);
             }
         }
+        } else {
+        for (j = 0; j < img_h; j++) {   /* luma: one row of img_w samples per iteration */
+            fwrite(frame->planes10[0] + j * frame->i_stride[0], sizeof(pel10_t), img_w, fp);
+        }
+        /* write whole pel10_t samples: the element size is sizeof(pel10_t), the count is the row width */
+        if (frame->i_plane == 3) {
+            for (j = 0; j < (img_h >> 1); j++) {
+                fwrite(frame->planes10[1] + j * frame->i_stride[1], sizeof(pel10_t), img_w >> 1, fp);
+            }
+            /* second chroma plane: same half-width, half-height layout as the first */
+            for (j = 0; j < (img_h >> 1); j++) {
+                fwrite(frame->planes10[2] + j * frame->i_stride[2], sizeof(pel10_t), img_w >> 1, fp);
+            }
+        }
+        }
 
     }
 }