diff --git a/source/common/avs2_defs.h b/source/common/avs2_defs.h index 7a26160..1926068 100644 --- a/source/common/avs2_defs.h +++ b/source/common/avs2_defs.h @@ -256,7 +256,7 @@ enum sao_class_e { #define XAVS2_MIN3(a, b, c) XAVS2_MIN((a), XAVS2_MIN((b),(c))) #define XAVS2_MAX3(a, b, c) XAVS2_MAX((a), XAVS2_MAX((b),(c))) -#define XAVS2_CLIP1(a) ((a) > max_pel_value ? max_pel_value : ((a) < 0 ? 0 : (a))) +//#define XAVS2_CLIP1(a) ((a) > max_pel_value ? max_pel_value : ((a) < 0 ? 0 : (a))) #define XAVS2_CLIP3F(L, H, v) (((v) < (L)) ? (L) : (((v) > (H)) ? (H) : (v))) #define XAVS2_CLIP3(L, H, v) xavs2_clip3(L, H, v) #define XAVS2_ABS(A) ((A) < 0 ? (-(A)) : (A)) // abs macro, faster than procedure @@ -281,9 +281,9 @@ static void XAVS2_SWAP_PTR(T *&x, T *&y) * global variables * =========================================================================== */ -static const int g_bit_depth = BIT_DEPTH; -static const int max_pel_value = (1 << BIT_DEPTH) - 1; -static const int g_dc_value = (1 << BIT_DEPTH) >> 1; +//static const int g_bit_depth = BIT_DEPTH; +//static const int max_pel_value = (1 << BIT_DEPTH) - 1; +//static const int g_dc_value = (1 << BIT_DEPTH) >> 1; /** * =========================================================================== @@ -291,10 +291,10 @@ static const int g_dc_value = (1 << BIT_DEPTH) >> 1; * =========================================================================== */ -static ALWAYS_INLINE pel_t xavs2_clip_pixel(int x) +/*static ALWAYS_INLINE pel_t xavs2_clip_pixel(int x) { return (pel_t)((x & ~max_pel_value) ? 
(-x) >> 31 & max_pel_value : x); -} +}*/ static ALWAYS_INLINE int xavs2_clip3(int i_min, int i_max, int v) { @@ -323,19 +323,19 @@ static ALWAYS_INLINE int xavs2_median(int a, int b, int c) return b; } -// 返回数值的符号位,负数返回-1,否则返回1 +// return the sign of the value: -1 for negative, otherwise 1 static ALWAYS_INLINE int xavs2_sign2(int val) { return ((val >> 31) << 1) + 1; } -// 返回数值的符号位,负数返回-1,0值返回0,正数返回1 +// return the sign of the value: -1 for negative, 0 for zero, 1 for positive static ALWAYS_INLINE int xavs2_sign3(int val) { return (val >> 31) | (int)(((uint32_t)-val) >> 31u); } -// 计算正整数的log2值,0和1时返回0,其他返回log2(val) +// compute log2 of a positive integer: returns 0 for 0 and 1, otherwise log2(val) #define xavs2_log2u(val) xavs2_ctz(val) diff --git a/source/common/basic_types.h b/source/common/basic_types.h index 59d9127..a738055 100644 --- a/source/common/basic_types.h +++ b/source/common/basic_types.h @@ -47,11 +47,16 @@ * basic types * =========================================================================== */ -typedef uint8_t pel_t; /* type for pixel */ -typedef int16_t itr_t; /* intra prediction temp */ + +typedef uint16_t pel10_t; /* type for pixel value */ +typedef uint64_t pixel10_4; /* type for 4-pixels value */ +typedef int32_t itr10_t; /* intra prediction temp */ +typedef uint8_t pel8_t; /* type for pixel value */ +typedef uint32_t pixel8_4; /* type for 4-pixels value */ +typedef int16_t itr8_t; /* intra prediction temp */ + typedef uint16_t sum_t; typedef uint32_t sum2_t; -typedef uint32_t pixel4; typedef int32_t ssum2_t; /* Signed sum */ typedef int32_t dist_t; diff --git a/source/common/common.h b/source/common/common.h index ab91348..14462d1 100644 --- a/source/common/common.h +++ b/source/common/common.h @@ -106,18 +106,26 @@ /* --------------------------------------------------------------------------- * memory malloc */ -#define CHECKED_MALLOC(var, type, size) \ +#define CHECKED_MALLOC8(var, type, size) \ MULTI_LINE_MACRO_BEGIN\ (var) = (type)xavs2_malloc(size);\ if ((var) == NULL) {\ - goto fail;\ + goto fail8;\ + }\ + MULTI_LINE_MACRO_END +
+#define CHECKED_MALLOC10(var, type, size) \ + MULTI_LINE_MACRO_BEGIN\ + (var) = (type)xavs2_malloc(size);\ + if ((var) == NULL) {\ + goto fail10;\ }\ MULTI_LINE_MACRO_END #define CHECKED_MALLOCZERO(var, type, size) \ MULTI_LINE_MACRO_BEGIN\ size_t new_size = ((size + 31) >> 5) << 5; /* align the size to 32 bytes */ \ - CHECKED_MALLOC(var, type, new_size);\ + CHECKED_MALLOC8(var, type, new_size);\ g_funcs.memzero_aligned(var, new_size); \ MULTI_LINE_MACRO_END @@ -392,7 +400,7 @@ typedef union runlevel_pair_t { /* --------------------------------------------------------------------------- * run-level infos (CG: Coefficient Group) - * 熵编码过程中最大的变换块为 32x32,最多 8*8 个CG + * the largest transform block during entropy coding is 32x32, i.e. at most 8*8 CGs */ typedef struct runlevel_t { ALIGN16(runlevel_pair_t runlevels_cg[16]); @@ -411,7 +419,7 @@ typedef struct runlevel_t { * binary_t */ typedef struct binary_t { - /* 语法元素编码用函数指针 */ + /* function pointers for coding syntax elements */ int (*write_intra_pred_mode)(aec_t *p_aec, int ipmode); int (*write_ctu_split_flag)(aec_t *p_aec, int i_cu_split, int i_cu_level); int (*est_cu_header)(xavs2_t *h, aec_t *p_aec, cu_t *p_cu); @@ -464,14 +472,14 @@ typedef struct binary_t { #define NUM_LAST_CG_CTX_CHROMA 6 #define NUM_SIGN_CG_CTX_LUMA 2 #define NUM_SIGN_CG_CTX_CHROMA 1 -#define NUM_LAST_POS_CTX_LUMA 48 /* last_coeff_pos_x 和 last_coeff_pos_y 共计有48个色度分量上下文 */ -#define NUM_LAST_POS_CTX_CHROMA 12 /* last_coeff_pos_x 和 last_coeff_pos_y 共计有12个色度分量上下文 */ +#define NUM_LAST_POS_CTX_LUMA 48 /* last_coeff_pos_x and last_coeff_pos_y: 48 contexts in total */ +#define NUM_LAST_POS_CTX_CHROMA 12 /* last_coeff_pos_x and last_coeff_pos_y: 12 chroma contexts in total */ #define NUM_MAP_CTX 12 #define NUM_LAST_CG_CTX (NUM_LAST_CG_CTX_LUMA + NUM_LAST_CG_CTX_CHROMA) /* last_cg_pos:6; + last_cg0_flag:2(IsChroma); last_cg_x:2; last_cg_y:2 */ #define NUM_SIGN_CG_CTX (NUM_SIGN_CG_CTX_LUMA + NUM_SIGN_CG_CTX_CHROMA) #define NUM_LAST_POS_CTX (NUM_LAST_POS_CTX_LUMA + NUM_LAST_POS_CTX_CHROMA) /* last_coeff_pos_x: (30) + last_coeff_pos_y: (30)
*/ -#define NUM_COEFF_LEVEL_CTX 40 /* CoeffLevelMinus1Band 为 0 时 coeff_level_minus1_pos_in_band */ +#define NUM_COEFF_LEVEL_CTX 40 /* coeff_level_minus1_pos_in_band when CoeffLevelMinus1Band is 0 */ #define NUM_SAO_MERGE_FLAG_CTX 3 #define NUM_SAO_MODE_CTX 1 @@ -682,27 +690,27 @@ typedef struct ctx_set_t { context_t pu_reference_index [NUM_REF_NO_CTX ]; context_t cbp_contexts [NUM_CBP_CTX ]; context_t mvd_contexts [2][NUM_MVD_CTX ]; - /* 帧间预测 */ + /* inter prediction */ context_t pu_type_index [NUM_INTER_DIR_CTX ]; // b_pu_type_index[15] = f_pu_type_index[3] + dir_multi_hypothesis_mode[12] context_t b_pu_type_min_index [NUM_INTER_DIR_MIN_CTX ]; // b_pu_type_index2 // for B_NxN // f_pu_type_index2 // for F_NxN - context_t cu_subtype_index [DS_MAX_NUM ]; // B_Skip/B_Direct, F_Skip/F_Direct 公用 + context_t cu_subtype_index [DS_MAX_NUM ]; // shared by B_Skip/B_Direct and F_Skip/F_Direct context_t weighted_skip_mode [WPM_NUM ]; - /* 帧内预测 */ + /* intra prediction */ context_t intra_luma_pred_mode [NUM_INTRA_MODE_CTX ]; context_t intra_chroma_pred_mode [NUM_INTRA_MODE_C_CTX ]; - /* CU 级别QP调整 */ + /* CU-level QP adjustment */ #if ENABLE_RATE_CONTROL_CU context_t delta_qp_contexts [NUM_DELTA_QP_CTX ]; #endif - /* 变换系数编码 */ + /* transform coefficient coding */ context_t coeff_run [2][NUM_BLOCK_TYPES][NUM_MAP_CTX ]; // [0:Luma, 1:Chroma][rank][ctx_idx] context_t nonzero_cg_flag [NUM_SIGN_CG_CTX ]; context_t last_cg_contexts [NUM_LAST_CG_CTX ]; context_t last_pos_contexts [NUM_LAST_POS_CTX ]; context_t coeff_level [NUM_COEFF_LEVEL_CTX ]; - /* 后处理模块 */ + /* post-processing modules */ context_t sao_merge_type_index [NUM_SAO_MERGE_FLAG_CTX]; context_t sao_mode [NUM_SAO_MODE_CTX ]; context_t sao_interval_offset_abs [NUM_SAO_OFFSET_CTX ]; @@ -749,7 +757,8 @@ typedef struct slice_t { uint8_t *p_slice_bs_buf; /* pointer of bitstream buffer (start address) */ /* slice buffers */ - pel_t *slice_intra_border[3]; /* buffer for store decoded bottom pixels of the top lcu row (before filter) */ + pel8_t *slice_intra_border8[3]; /* buffer for store decoded bottom pixels
of the top lcu row (before filter) */ + pel10_t *slice_intra_border10[3]; /* buffer for store decoded bottom pixels of the top lcu row (before filter) */ uint8_t *slice_deblock_flag[2]; /* buffer for edge filter flag (of one LCU row), [dir][(scu_y, scu_x)] */ int8_t *slice_ipredmode; /* [(i_height_in_minpu + 1) * (i_width_in_minpu + 16)], prediction intra mode */ @@ -808,7 +817,8 @@ struct cu_info_t { int i_scu_x; /* horizontal position for the first SCU in CU */ int i_scu_y; /* vertical position for the first SCU in CU */ - pel_t *p_rec[3]; /* reconstruction pixels for current cu [y/u/v] */ + pel8_t *p_rec8[3]; /* reconstruction pixels for current cu [y/u/v] */ + pel10_t *p_rec10[3]; /* reconstruction pixels for current cu [y/u/v] */ coeff_t *p_coeff[3]; /* residual coefficient for current cu [y/u/v] */ int8_t i_level; /* cu level, 3: 8x8, 4: 16x16, 5: 32x32, 6: 64x64 */ @@ -861,13 +871,13 @@ struct cu_info_t { * cu_mv_mode_t */ typedef struct cu_mv_mode_t { - mv_t all_sym_mv[1]; /* 对称模式的MV */ + mv_t all_sym_mv[1]; /* MV of symmetric mode */ mv_t all_single_mv[MAX_REFS]; - /* mvp可以只对整个LCU只保留一份,无须按照深度分层 */ + /* only one copy of mvp needs to be kept for the whole LCU; no need to layer it by depth */ mv_t all_mvp[MAX_REFS]; /* 1st MVP of dual hypothesis prediction mode, or Foreword of BiPrediction */ - /* 双向MV也只需要保留一份 */ + /* only one copy of the bi-directional MVs needs to be kept as well */ mv_t all_dual_mv_1st[MAX_REFS]; mv_t all_dual_mv_2nd[MAX_REFS]; } cu_mv_mode_t; @@ -885,10 +895,10 @@ typedef struct cu_mc_param_t { * cu_mode_t */ typedef struct cu_mode_t { - uint8_t mv_padding1[16]; /* 避免越界,至少需2字节,此处为对齐补到16字节 */ + uint8_t mv_padding1[16]; /* padding to avoid out-of-bounds access: at least 2 bytes needed, padded to 16 bytes here for alignment */ cu_mv_mode_t mvs[MAX_INTER_MODES][4]; /* MVs for normal inter prediction */ cu_mc_param_t best_mc; /* MVs to store */ - cu_mc_param_t best_mc_tmp; /* 用于算法 OPT_ROUGH_PU_SEL 保存多个帧间划分模式的最佳参数(不一定是全局最优) */ + cu_mc_param_t best_mc_tmp; /* used by algorithm OPT_ROUGH_PU_SEL to keep the best parameters of multiple inter partition modes (not necessarily the global optimum) */ int8_t ref_idx_single[4]; /* [block], preserved for DMH */ @@ -923,7 +933,7 @@ typedef struct
cu_feature_t { * 2: only try current depth * --------------------------- */ int pred_split_type; /* prediction of cu split type: 0: un-determined; 1: split; 2: not-split */ - rdcost_t pred_costs[MAX_PRED_MODES]; /* 每种PU划分模式的 cost (基于预分析等获取) */ + rdcost_t pred_costs[MAX_PRED_MODES]; /* cost of each PU partition mode (obtained from pre-analysis, etc.) */ } cu_feature_t; @@ -1046,10 +1056,12 @@ struct xavs2_frame_t { int i_stride[3]; /* stride for Y/U/V */ int i_width[3]; /* width for Y/U/V */ int i_lines[3]; /* height for Y/U/V */ - pel_t *planes[3]; /* pointers to Y/U/V data buffer */ - pel_t *filtered[16]; /* pointers to interpolated luma data buffers */ - - pel_t *plane_buf; + pel10_t *planes10[3]; /* pointers to Y/U/V data buffer */ + pel10_t *filtered10[16]; /* pointers to interpolated luma data buffers */ + pel10_t *plane_buf10; + pel8_t *planes8[3]; /* pointers to Y/U/V data buffer */ + pel8_t *filtered8[16]; /* pointers to interpolated luma data buffers */ + pel8_t *plane_buf8; int size_plane_buf; /* bit stream buffer */ @@ -1106,7 +1118,8 @@ typedef struct xavs2_me_t { bool_t b_search_dmh; /* is searching for DMH mode */ /* pointers */ - pel_t *p_fenc; /* pointer to the current PU block in source CTU */ + pel8_t *p_fenc8; /* pointer to the current PU block in source CTU */ + pel10_t *p_fenc10; /* pointer to the current PU block in source CTU */ xavs2_frame_t *p_fref_1st; /* pointer to the current (1st) reference frame */ xavs2_frame_t *p_fref_2nd; /* pointer to the current 2nd reference frame */ @@ -1334,7 +1347,8 @@ typedef struct cu_layer_t { rdcost_t mode_rdcost[MAX_PRED_MODES]; /* min rd-cost for each mode */ int mask_md_res_pred; /* available mode mask */ - pel_t *p_rec_tmp[3]; /* tmp pointers to ping-pong buffer for swapping */ + pel8_t *p_rec8_tmp[3]; /* tmp pointers to ping-pong buffer for swapping */ + pel10_t *p_rec10_tmp[3]; /* tmp pointers to ping-pong buffer for swapping */ coeff_t *p_coeff_tmp[3]; /* tmp pointers to ping-pong buffer for swapping */ cu_info_t cu_best; /* best
info for each cu depth */ @@ -1355,16 +1369,21 @@ typedef struct cu_layer_t { #define FDEC_BUF_SIZE (FDEC_STRIDE * (MAX_CU_SIZE + MAX_CU_SIZE / 2)) #define LCU_BUF_SIZE (MAX_CU_SIZE * MAX_CU_SIZE) - ALIGN32(pel_t rec_buf_y [3][LCU_BUF_SIZE]); /* luma reconstruction buffer [cur/tmp/best][] */ + ALIGN32(pel8_t rec8_buf_y [3][LCU_BUF_SIZE]); /* luma reconstruction buffer [cur/tmp/best][] */ + ALIGN32(pel10_t rec10_buf_y [3][LCU_BUF_SIZE]); /* luma reconstruction buffer [cur/tmp/best][] */ ALIGN32(coeff_t coef_buf_y [3][LCU_BUF_SIZE]); /* luma coefficient buffer [cur/tmp/best][] */ - ALIGN32(pel_t rec_buf_uv [2][3][LCU_BUF_SIZE >> 2]); /* chroma reconstruction buffer [uv][cur/tmp/best][] */ + ALIGN32(pel8_t rec8_buf_uv [2][3][LCU_BUF_SIZE >> 2]); /* chroma reconstruction buffer [uv][cur/tmp/best][] */ + ALIGN32(pel10_t rec10_buf_uv [2][3][LCU_BUF_SIZE >> 2]); /* chroma reconstruction buffer [uv][cur/tmp/best][] */ ALIGN32(coeff_t coef_buf_uv[2][3][LCU_BUF_SIZE >> 2]); /* chroma coefficient buffer [uv][cur/tmp/best][] */ /* inter prediction buffer */ - ALIGN32(pel_t buf_pred_inter_luma[2][LCU_BUF_SIZE]); /* temporary decoding buffer for inter prediction (luma) */ + ALIGN32(pel8_t buf_pred_inter_luma8[2][LCU_BUF_SIZE]); /* temporary decoding buffer for inter prediction (luma) */ + ALIGN32(pel10_t buf_pred_inter_luma10[2][LCU_BUF_SIZE]); /* temporary decoding buffer for inter prediction (luma) */ /* Ping-pong buffer for inter prediction */ - pel_t *buf_pred_inter; /* current inter prediction buffer */ - pel_t *buf_pred_inter_best; /* backup of best inter prediction */ + pel8_t *buf_pred_inter8; /* current inter prediction buffer */ + pel10_t *buf_pred_inter10; /* current inter prediction buffer */ + pel8_t *buf_pred_inter8_best; /* backup of best inter prediction */ + pel10_t *buf_pred_inter10_best; /* backup of best inter prediction */ } cu_layer_t; /* --------------------------------------------------------------------------- @@ -1376,13 +1395,18 @@ typedef struct 
cu_parallel_t { ALIGN32(coeff_t coeff_bak[LCU_BUF_SIZE]); /* buffers used for inter prediction */ - ALIGN32(pel_t buf_pred_inter_c[LCU_BUF_SIZE >> 1]); /* temporary decoding buffer for inter prediction (chroma) */ - ALIGN32(pel_t buf_pixel_temp [LCU_BUF_SIZE]); /* temporary pixel buffer, used for bi/dual-prediction */ + ALIGN32(pel8_t buf_pred_inter8_c[LCU_BUF_SIZE >> 1]); /* temporary decoding buffer for inter prediction (chroma) */ + ALIGN32(pel10_t buf_pred_inter10_c[LCU_BUF_SIZE >> 1]); /* temporary decoding buffer for inter prediction (chroma) */ + ALIGN32(pel8_t buf_pixel_temp8 [LCU_BUF_SIZE]); /* temporary pixel buffer, used for bi/dual-prediction */ + ALIGN32(pel10_t buf_pixel_temp10 [LCU_BUF_SIZE]); /* temporary pixel buffer, used for bi/dual-prediction */ /* predication buffers for all intra modes */ - ALIGN32(pel_t intra_pred [NUM_INTRA_MODE ][LCU_BUF_SIZE]); /* for all 33 luma prediction modes */ - ALIGN32(pel_t intra_pred_c[NUM_INTRA_MODE_CHROMA][LCU_BUF_SIZE >> 1]); /* for all chroma intra prediction modes */ - ALIGN32(pel_t buf_edge_pixels[MAX_CU_SIZE << 3]); /* reference pixels for intra luma/chroma prediction */ + ALIGN32(pel8_t intra8_pred [NUM_INTRA_MODE ][LCU_BUF_SIZE]); /* for all 33 luma prediction modes */ + ALIGN32(pel10_t intra10_pred [NUM_INTRA_MODE ][LCU_BUF_SIZE]); /* for all 33 luma prediction modes */ + ALIGN32(pel8_t intra8_pred_c[NUM_INTRA_MODE_CHROMA][LCU_BUF_SIZE >> 1]); /* for all chroma intra prediction modes */ + ALIGN32(pel10_t intra10_pred_c[NUM_INTRA_MODE_CHROMA][LCU_BUF_SIZE >> 1]); /* for all chroma intra prediction modes */ + ALIGN32(pel8_t buf_edge_pixels8[MAX_CU_SIZE << 3]); /* reference pixels for intra luma/chroma prediction */ + ALIGN32(pel10_t buf_edge_pixels10[MAX_CU_SIZE << 3]); /* reference pixels for intra luma/chroma prediction */ runlevel_t runlevel; /* run level buffer for RDO */ @@ -1408,7 +1432,7 @@ struct xavs2_t { ALIGN32(xavs2_log_t module_log); /* log module */ /* === BEGIN 
=================================================== * communal variables - * 序列级(编码的所有帧)共享变量区域开始 + * start of the sequence-level shared variable area (shared by all encoded frames) */ ALIGN32(SYNC_VARS_1(communal_vars_1)); @@ -1436,7 +1460,7 @@ struct xavs2_t { bool_t b_progressive; bool_t b_field_sequence; bool_t use_fractional_me; /* whether use fractional Motion Estimation - * 0: 关闭分像素搜索;1: 开启1/2分像素搜索;2:开启1/4分像素搜索 + * 0: fractional-pixel search disabled; 1: 1/2-pixel search enabled; 2: 1/4-pixel search enabled */ bool_t use_fast_sub_me; /* whether use fast quarter Motion Estimation: skip half fractional search point (from futl) */ bool_t UMH_big_hex_level; /* whether skip big hex pattern when using UMH (from futl) @@ -1467,8 +1491,11 @@ struct xavs2_t { int min_mv_range[2]; /* mv range (min) decided by the level id */ int max_mv_range[2]; /* mv range (max) decided by the level id */ /* function pointers */ - int (*get_intra_candidates_luma)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, - pel_t *p_fenc, int mpm[], int blockidx, + int (*get_intra_candidates_luma8)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel8_t *p_fenc, int mpm[], int blockidx, + int block_x, int block_y, int block_w, int block_h); + int (*get_intra_candidates_luma10)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel10_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); int (*get_intra_candidates_chroma)(xavs2_t *h, cu_t *p_cu, int i_level, int pix_y_c, int pix_x_c, intra_candidate_t *p_candidate_list); @@ -1477,8 +1504,8 @@ struct xavs2_t { uint8_t *tab_avail_TR; /* pointers to array of available table, Top Right */ uint8_t *tab_avail_DL; /* pointers to array of available table, Down Left */ uint8_t tab_num_intra_rdo[MAX_CU_SIZE_IN_BIT + 1]; /* pointers to array of table, indicate numbers of intra prediction modes for RDO */ - int8_t num_intra_rmd_dist2; /* 距离2的角度的搜索数量 */ - int8_t num_intra_rmd_dist1; /* 距离1的角度的搜索数量 */ + int8_t num_intra_rmd_dist2; /* number of angles searched at distance 2 */ + int8_t num_intra_rmd_dist1; /* number of angles searched at distance 1 */
int8_t num_rdo_intra_chroma; /* number of RDO modes for intra chroma prediction */ SYNC_VARS_2(communal_vars_2); @@ -1486,7 +1513,7 @@ struct xavs2_t { /* === BEGIN =================================================== * row-dependent variables : values below need to be synchronized between rows - * 帧级共享变量区域开始,每帧的多个行级线程之间访问相同内容 + * start of the frame-level shared variable area; the row-level threads of a frame access the same content */ SYNC_VARS_1(row_vars_1); @@ -1517,12 +1544,13 @@ struct xavs2_t { slice_t *slices[MAX_SLICES]; /* all slices */ int i_slice_index; /* slice index for the current thread */ - /* 不同Slice不同的buffer */ - pel_t *intra_border[3]; /* buffer for store decoded bottom pixels of the top lcu row (before filter) */ + /* buffers that differ per slice */ + pel8_t *intra_border8[3]; /* buffer for store decoded bottom pixels of the top lcu row (before filter) */ + pel10_t *intra_border10[3]; /* buffer for store decoded bottom pixels of the top lcu row (before filter) */ uint8_t *p_deblock_flag[2]; /* buffer for edge filter flag (of one LCU row), [dir][(scu_y, scu_x)] */ int8_t *ipredmode; /* [(i_height_in_minpu + 1) * (i_width_in_minpu + 16)], prediction intra mode */ - /* 帧级唯一的buffer */ + /* buffers unique per frame */ int8_t *lcu_slice_idx; /* [i_height_in_lcu][i_width_in_lcu] */ int8_t *dir_pred; /* [i_height_in_minpu][i_width_in_minpu], inter prediction direction */ int8_t *fwd_1st_ref; /* [i_height_in_minpu][i_width_in_minpu] */ @@ -1537,7 +1565,7 @@ struct xavs2_t { double thres_qsfd_cu[2][CTU_DEPTH]; /* QSFD threshold for inter frame, [0:inter, 1:intra][log2_cu_size - 3] */ xavs2_frame_t *img_sao; /* reconstruction image for SAO */ - SAOStatData(*sao_stat_datas)[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES]; /* [lcu][comp][types], 可不用全局 */ + SAOStatData(*sao_stat_datas)[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES]; /* [lcu][comp][types], need not be global */ SAOBlkParam(*sao_blk_params)[NUM_SAO_COMPONENTS]; /* [lcu][comp] */ int (*num_sao_lcu_off)[NUM_SAO_COMPONENTS]; /* [lcu_row][comp] */ bool_t slice_sao_on [NUM_SAO_COMPONENTS];
@@ -1594,8 +1622,11 @@ struct xavs2_t { bool_t b_2nd_rdcost_pass; /* 2nd pass for RDCost update */ /* function pointers for RDO */ - int (*get_intra_dir_for_rdo_luma)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, - pel_t *p_fenc, int mpm[], int blockidx, + int (*get_intra_dir_for_rdo_luma8)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel8_t *p_fenc, int mpm[], int blockidx, + int block_x, int block_y, int block_w, int block_h); + int (*get_intra_dir_for_rdo_luma10)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel10_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); int (*get_skip_mvs)(xavs2_t *h, cu_t *p_cu); /* get MVs for skip/direct mode */ @@ -1606,8 +1637,10 @@ struct xavs2_t { cu_t *p_ctu; /* pointer to the top of current CTU */ /* 2, enc/dec/pred Y/U/V pointers */ - pel_t *p_fdec[3]; /* [Y/U/V] pointer over lcu of the frame to be reconstructed */ - pel_t *p_fenc[3]; /* [Y/U/V] pointer over lcu of the frame to be compressed */ + pel8_t *p_fdec8[3]; /* [Y/U/V] pointer over lcu of the frame to be reconstructed */ + pel10_t *p_fdec10[3]; /* [Y/U/V] pointer over lcu of the frame to be reconstructed */ + pel8_t *p_fenc8[3]; /* [Y/U/V] pointer over lcu of the frame to be compressed */ + pel10_t *p_fenc10[3]; /* [Y/U/V] pointer over lcu of the frame to be compressed */ coeff_t *lcu_coeff[3]; /* [Y/U/V] coefficients of LCU */ @@ -1619,15 +1652,22 @@ struct xavs2_t { #if PARALLEL_INSIDE_CTU cu_parallel_t cu_enc [CTU_DEPTH]; #else - cu_parallel_t cu_enc [1]; /* 无CTU内的多线程时,只需要一个 */ + cu_parallel_t cu_enc [1]; /* only one is needed when there is no multi-threading inside a CTU */ #endif - ALIGN32(pel_t fenc_buf[FENC_BUF_SIZE]); /* encoding buffer (source Y/U/V buffer) */ - ALIGN32(pel_t fdec_buf[FDEC_BUF_SIZE]); /* decoding buffer (Reconstruction Y/U/V buffer) */ - struct lcu_intra_border_t { - ALIGN32(pel_t rec_left[MAX_CU_SIZE]); /* Left border of current LCU */ - ALIGN32(pel_t rec_top[MAX_CU_SIZE * 2 + 32]); /* top-left, top and
top-right samples (Reconstruction) of current LCU */ - } ctu_border[IMG_CMPNTS]; /* Y, U, V components */ + ALIGN32(pel8_t fenc_buf8[FENC_BUF_SIZE]); /* encoding buffer (source Y/U/V buffer) */ + ALIGN32(pel8_t fdec_buf8[FDEC_BUF_SIZE]); /* decoding buffer (Reconstruction Y/U/V buffer) */ + ALIGN32(pel10_t fenc_buf10[FENC_BUF_SIZE]); /* encoding buffer (source Y/U/V buffer) */ + ALIGN32(pel10_t fdec_buf10[FDEC_BUF_SIZE]); /* decoding buffer (Reconstruction Y/U/V buffer) */ + struct lcu_intra_border8_t { + ALIGN32(pel8_t rec_left[MAX_CU_SIZE]); /* Left border of current LCU */ + ALIGN32(pel8_t rec_top[MAX_CU_SIZE * 2 + 32]); /* top-left, top and top-right samples (Reconstruction) of current LCU */ + } ctu_border8[IMG_CMPNTS]; /* Y, U, V components */ + + struct lcu_intra_border10_t { + ALIGN32(pel10_t rec_left[MAX_CU_SIZE]); /* Left border of current LCU */ + ALIGN32(pel10_t rec_top[MAX_CU_SIZE * 2 + 32]); /* top-left, top and top-right samples (Reconstruction) of current LCU */ + } ctu_border10[IMG_CMPNTS]; /* Y, U, V components */ /* buffer for the coding tree units */ ALIGN16(cu_t all_cu[85]); /* all cu: 1(64x64) + 4(32x32) + 16(16x16) + 64(8x8) = 85 */ @@ -1640,7 +1680,7 @@ struct xavs2_t { /* coding states in RDO, independent for each thread */ struct coding_states { - /* 只用于备份上下文状态,无需初始化 */ + /* only used to back up context states; no initialization needed */ aec_t cs_sao_start; aec_t cs_sao_best; aec_t cs_sao_temp; diff --git a/source/common/cudata.c b/source/common/cudata.c index 04e20f0..d84d534 100644 --- a/source/common/cudata.c +++ b/source/common/cudata.c @@ -192,13 +192,31 @@ void cu_get_mvds(xavs2_t *h, cu_t *p_cu) /* --------------------------------------------------------------------------- * copy one block (multi-planes) */ -static void block_copy_x3(pel_t *p_dst[], int i_dst[], pel_t *p_src[], int i_src[], int i_width[], int i_height[], int i_planes) +static void block_copy8_x3(pel8_t *p_dst[], int i_dst[], pel8_t *p_src[], int i_src[], int i_width[], int i_height[], int i_planes) { -
pel_t *dst, *src; + pel8_t *dst, *src; int y, k; for (k = 0; k < i_planes; k++) { - int i_size = i_width[k] * sizeof(pel_t); + int i_size = i_width[k] * sizeof(pel8_t); + memcpy_t f_memcpy = i_size & 15 ? memcpy : g_funcs.memcpy_aligned; + dst = p_dst[k]; + src = p_src[k]; + for (y = i_height[k]; y != 0; y--) { + f_memcpy(dst, src, i_size); + dst += i_dst[k]; + src += i_src[k]; + } + } +} + +static void block_copy10_x3(pel10_t *p_dst[], int i_dst[], pel10_t *p_src[], int i_src[], int i_width[], int i_height[], int i_planes) +{ + pel10_t *dst, *src; + int y, k; + + for (k = 0; k < i_planes; k++) { + int i_size = i_width[k] * sizeof(pel10_t); memcpy_t f_memcpy = i_size & 15 ? memcpy : g_funcs.memcpy_aligned; dst = p_dst[k]; src = p_src[k]; @@ -213,7 +231,19 @@ static void block_copy_x3(pel_t *p_dst[], int i_dst[], pel_t *p_src[], int i_src /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void -xavs2_copy_col1(pel_t *dst, pel_t *src, const int height, const int stride) +xavs2_copy_col18(pel8_t *dst, pel8_t *src, const int height, const int stride) +{ + int i; + int k = 0; + + for (i = height; i != 0; i--) { + dst[k] = src[k]; + k += stride; + } +} + +static ALWAYS_INLINE void +xavs2_copy_col110(pel10_t *dst, pel10_t *src, const int height, const int stride) { int i; int k = 0; @@ -228,13 +258,28 @@ xavs2_copy_col1(pel_t *dst, pel_t *src, const int height, const int stride) * cache CTU border */ static INLINE -void xavs2_cache_lcu_border(pel_t *p_dst, const pel_t *p_top, - const pel_t *p_left, int i_left, +void xavs2_cache_lcu_border8(pel8_t *p_dst, const pel8_t *p_top, + const pel8_t *p_left, int i_left, + int lcu_width, int lcu_height) +{ + int i; + /* top, top-right */ + memcpy(p_dst, p_top, (2 * lcu_width + 1) * sizeof(pel8_t)); + /* left */ + for (i = 1; i <= lcu_height; i++) { + p_dst[-i] = p_left[0]; + p_left += i_left; + } +} + +static INLINE +void xavs2_cache_lcu_border10(pel10_t *p_dst, const pel10_t 
*p_top, + const pel10_t *p_left, int i_left, int lcu_width, int lcu_height) { int i; /* top, top-right */ - memcpy(p_dst, p_top, (2 * lcu_width + 1) * sizeof(pel_t)); + memcpy(p_dst, p_top, (2 * lcu_width + 1) * sizeof(pel10_t)); /* left */ for (i = 1; i <= lcu_height; i++) { p_dst[-i] = p_left[0]; @@ -246,14 +291,32 @@ void xavs2_cache_lcu_border(pel_t *p_dst, const pel_t *p_top, * cache CTU border (UV components together) */ static INLINE -void xavs2_cache_lcu_border_uv(pel_t *p_dst_u, const pel_t *p_top_u, const pel_t *p_left_u, - pel_t *p_dst_v, const pel_t *p_top_v, const pel_t *p_left_v, +void xavs2_cache_lcu_border8_uv(pel8_t *p_dst_u, const pel8_t *p_top_u, const pel8_t *p_left_u, + pel8_t *p_dst_v, const pel8_t *p_top_v, const pel8_t *p_left_v, + int i_left, int lcu_width, int lcu_height) +{ + int i; + /* top, top-right */ + memcpy(p_dst_u, p_top_u, (2 * lcu_width + 1) * sizeof(pel8_t)); + memcpy(p_dst_v, p_top_v, (2 * lcu_width + 1) * sizeof(pel8_t)); + /* left */ + for (i = 1; i <= lcu_height; i++) { + p_dst_u[-i] = p_left_u[0]; + p_dst_v[-i] = p_left_v[0]; + p_left_u += i_left; + p_left_v += i_left; + } +} + +static INLINE +void xavs2_cache_lcu_border10_uv(pel10_t *p_dst_u, const pel10_t *p_top_u, const pel10_t *p_left_u, + pel10_t *p_dst_v, const pel10_t *p_top_v, const pel10_t *p_left_v, int i_left, int lcu_width, int lcu_height) { int i; /* top, top-right */ - memcpy(p_dst_u, p_top_u, (2 * lcu_width + 1) * sizeof(pel_t)); - memcpy(p_dst_v, p_top_v, (2 * lcu_width + 1) * sizeof(pel_t)); + memcpy(p_dst_u, p_top_u, (2 * lcu_width + 1) * sizeof(pel10_t)); + memcpy(p_dst_v, p_top_v, (2 * lcu_width + 1) * sizeof(pel10_t)); /* left */ for (i = 1; i <= lcu_height; i++) { p_dst_u[-i] = p_left_u[0]; @@ -324,7 +387,7 @@ void lcu_start_init_pos(xavs2_t *h, int i_lcu_x, int i_lcu_y) p_cu_info->i_cu_qp = (int8_t)(h->i_qp); // needed in loop filter (even if constant QP is used) // reset syntax element entries in cu_info_t - // 这些元素在编码每个LCU时会设置,所以此处不需要修改 + // these elements are set when each LCU is encoded, so they need not be modified here
// p_cu_info->i_mode = PRED_SKIP; // p_cu_info->i_cbp = 0; // p_cu_info->i_level = MIN_CU_SIZE_IN_BIT; @@ -348,8 +411,9 @@ void lcu_start_init_pixels(xavs2_t *h, int i_lcu_x, int i_lcu_y) int blk_h[3]; int i_src[3]; int i_dst[3]; - pel_t *p_src[3]; - pel_t *p_dst[3]; + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_src[3]; + pel8_t *p_dst[3]; /* ------------------------------------------------------------- * 1, copy LCU pixel data from original image buffer @@ -357,34 +421,73 @@ void lcu_start_init_pixels(xavs2_t *h, int i_lcu_x, int i_lcu_y) i_src[0] = h->fenc->i_stride[0]; i_src[1] = h->fenc->i_stride[1]; i_src[2] = h->fenc->i_stride[2]; - p_src[0] = h->fenc->planes[0] + (img_y ) * i_src[0] + (img_x ); - p_src[1] = h->fenc->planes[1] + (img_y >> 1) * i_src[1] + (img_x >> 1); - p_src[2] = h->fenc->planes[2] + (img_y >> 1) * i_src[2] + (img_x >> 1); + p_src[0] = h->fenc->planes8[0] + (img_y ) * i_src[0] + (img_x ); + p_src[1] = h->fenc->planes8[1] + (img_y >> 1) * i_src[1] + (img_x >> 1); + p_src[2] = h->fenc->planes8[2] + (img_y >> 1) * i_src[2] + (img_x >> 1); i_dst[0] = i_dst[1] = i_dst[2] = FENC_STRIDE; - p_dst[0] = h->lcu.p_fenc[0]; - p_dst[1] = h->lcu.p_fenc[1]; - p_dst[2] = h->lcu.p_fenc[2]; + p_dst[0] = h->lcu.p_fenc8[0]; + p_dst[1] = h->lcu.p_fenc8[1]; + p_dst[2] = h->lcu.p_fenc8[2]; blk_w[0] = lcu_width; blk_h[0] = lcu_height; blk_w[1] = blk_w[2] = lcu_width >> 1; blk_h[1] = blk_h[2] = lcu_height >> 1; - block_copy_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3); + block_copy8_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3); /* first CTU of LCU row */ if (h->fenc->b_enable_intra || h->fenc->i_frm_type == XAVS2_TYPE_I) { if (img_x == 0) { - memcpy(h->lcu.ctu_border[0].rec_top + 1, h->intra_border[0], lcu_width * 2 * sizeof(pel_t)); - memcpy(h->lcu.ctu_border[1].rec_top + 1, h->intra_border[1], lcu_width * sizeof(pel_t)); - memcpy(h->lcu.ctu_border[2].rec_top + 1, h->intra_border[2], lcu_width * sizeof(pel_t));
+ memcpy(h->lcu.ctu_border8[0].rec_top + 1, h->intra_border8[0], lcu_width * 2 * sizeof(pel8_t)); + memcpy(h->lcu.ctu_border8[1].rec_top + 1, h->intra_border8[1], lcu_width * sizeof(pel8_t)); + memcpy(h->lcu.ctu_border8[2].rec_top + 1, h->intra_border8[2], lcu_width * sizeof(pel8_t)); } else if (h->param->i_lcurow_threads > 1) { /* top-right pixels */ - memcpy(h->lcu.ctu_border[0].rec_top + 1 + lcu_width, h->intra_border[0] + img_x + lcu_width, lcu_width * sizeof(pel_t)); - memcpy(h->lcu.ctu_border[1].rec_top + 1 + (lcu_width >> 1), h->intra_border[1] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel_t)); - memcpy(h->lcu.ctu_border[2].rec_top + 1 + (lcu_width >> 1), h->intra_border[2] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel_t)); + memcpy(h->lcu.ctu_border8[0].rec_top + 1 + lcu_width, h->intra_border8[0] + img_x + lcu_width, lcu_width * sizeof(pel8_t)); + memcpy(h->lcu.ctu_border8[1].rec_top + 1 + (lcu_width >> 1), h->intra_border8[1] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel8_t)); + memcpy(h->lcu.ctu_border8[2].rec_top + 1 + (lcu_width >> 1), h->intra_border8[2] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel8_t)); } } + } else { + pel10_t *p_src[3]; + pel10_t *p_dst[3]; + + /* ------------------------------------------------------------- + * 1, copy LCU pixel data from original image buffer + */ + i_src[0] = h->fenc->i_stride[0]; + i_src[1] = h->fenc->i_stride[1]; + i_src[2] = h->fenc->i_stride[2]; + p_src[0] = h->fenc->planes10[0] + (img_y ) * i_src[0] + (img_x ); + p_src[1] = h->fenc->planes10[1] + (img_y >> 1) * i_src[1] + (img_x >> 1); + p_src[2] = h->fenc->planes10[2] + (img_y >> 1) * i_src[2] + (img_x >> 1); + + i_dst[0] = i_dst[1] = i_dst[2] = FENC_STRIDE; + p_dst[0] = h->lcu.p_fenc10[0]; + p_dst[1] = h->lcu.p_fenc10[1]; + p_dst[2] = h->lcu.p_fenc10[2]; + + blk_w[0] = lcu_width; + blk_h[0] = lcu_height; + blk_w[1] = blk_w[2] = lcu_width >> 1; + blk_h[1] = blk_h[2] = lcu_height >> 1; + 
block_copy10_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3); + + /* first CTU of LCU row */ + if (h->fenc->b_enable_intra || h->fenc->i_frm_type == XAVS2_TYPE_I) { + if (img_x == 0) { + memcpy(h->lcu.ctu_border10[0].rec_top + 1, h->intra_border10[0], lcu_width * 2 * sizeof(pel10_t)); + memcpy(h->lcu.ctu_border10[1].rec_top + 1, h->intra_border10[1], lcu_width * sizeof(pel10_t)); + memcpy(h->lcu.ctu_border10[2].rec_top + 1, h->intra_border10[2], lcu_width * sizeof(pel10_t)); + } else if (h->param->i_lcurow_threads > 1) { + /* top-right pixels */ + memcpy(h->lcu.ctu_border10[0].rec_top + 1 + lcu_width, h->intra_border10[0] + img_x + lcu_width, lcu_width * sizeof(pel10_t)); + memcpy(h->lcu.ctu_border10[1].rec_top + 1 + (lcu_width >> 1), h->intra_border10[1] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel10_t)); + memcpy(h->lcu.ctu_border10[2].rec_top + 1 + (lcu_width >> 1), h->intra_border10[2] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel10_t)); + } + } + } } /* --------------------------------------------------------------------------- @@ -404,8 +507,9 @@ void lcu_end(xavs2_t *h, int i_lcu_x, int i_lcu_y) int blk_h[3]; int i_src[3]; int i_dst[3]; - pel_t *p_src[3]; - pel_t *p_dst[3]; + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_src[3]; + pel8_t *p_dst[3]; /* ------------------------------------------------------------- * 1, copy decoded LCU to frame buffer @@ -413,20 +517,20 @@ void lcu_end(xavs2_t *h, int i_lcu_x, int i_lcu_y) i_dst[0] = h->fdec->i_stride[0]; i_dst[1] = h->fdec->i_stride[1]; i_dst[2] = h->fdec->i_stride[2]; - p_dst[0] = h->fdec->planes[0] + (img_y) * i_dst[0] + (img_x); - p_dst[1] = h->fdec->planes[1] + (img_y_c) * i_dst[1] + (img_x_c); - p_dst[2] = h->fdec->planes[2] + (img_y_c) * i_dst[2] + (img_x_c); + p_dst[0] = h->fdec->planes8[0] + (img_y) * i_dst[0] + (img_x); + p_dst[1] = h->fdec->planes8[1] + (img_y_c) * i_dst[1] + (img_x_c); + p_dst[2] = h->fdec->planes8[2] + (img_y_c) * i_dst[2] + (img_x_c); 
i_src[0] = i_src[1] = i_src[2] = FDEC_STRIDE; - p_src[0] = h->lcu.p_fdec[0]; - p_src[1] = h->lcu.p_fdec[1]; - p_src[2] = h->lcu.p_fdec[2]; + p_src[0] = h->lcu.p_fdec8[0]; + p_src[1] = h->lcu.p_fdec8[1]; + p_src[2] = h->lcu.p_fdec8[2]; blk_w[0] = lcu_width; blk_h[0] = lcu_height; blk_w[1] = blk_w[2] = lcu_width_c; blk_h[1] = blk_h[2] = lcu_height_c; - block_copy_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3); + block_copy8_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3); /* ------------------------------------------------------------- * 2, backup right col and bottom row pixels for intra coding @@ -440,18 +544,68 @@ void lcu_end(xavs2_t *h, int i_lcu_x, int i_lcu_y) i_pred_mode_width_in_lcu * sizeof(int8_t)); /* cache top and left samples for intra prediction of next CTU */ - xavs2_cache_lcu_border(h->lcu.ctu_border[0].rec_top, h->intra_border[0] + img_x + lcu_width - 1, p_src[0] + lcu_width - 1, + xavs2_cache_lcu_border8(h->lcu.ctu_border8[0].rec_top, h->intra_border8[0] + img_x + lcu_width - 1, p_src[0] + lcu_width - 1, FDEC_STRIDE, lcu_width, lcu_height); - xavs2_cache_lcu_border_uv(h->lcu.ctu_border[1].rec_top, h->intra_border[1] + img_x_c + lcu_width_c - 1, p_src[1] + lcu_width_c - 1, - h->lcu.ctu_border[2].rec_top, h->intra_border[2] + img_x_c + lcu_width_c - 1, p_src[2] + lcu_width_c - 1, + xavs2_cache_lcu_border8_uv(h->lcu.ctu_border8[1].rec_top, h->intra_border8[1] + img_x_c + lcu_width_c - 1, p_src[1] + lcu_width_c - 1, + h->lcu.ctu_border8[2].rec_top, h->intra_border8[2] + img_x_c + lcu_width_c - 1, p_src[2] + lcu_width_c - 1, FDEC_STRIDE, lcu_width_c, lcu_height_c); /* 2.2, backup bottom row pixels */ if (i_lcu_y < h->i_height_in_lcu - 1) { - g_funcs.fast_memcpy(h->intra_border[0] + img_x, p_src[0] + (lcu_height - 1) * FDEC_STRIDE, lcu_width * sizeof(pel_t)); - g_funcs.fast_memcpy(h->intra_border[1] + img_x_c, p_src[1] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel_t)); - g_funcs.fast_memcpy(h->intra_border[2] + img_x_c, p_src[2] + 
(lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel_t)); + g_funcs.fast_memcpy(h->intra_border8[0] + img_x, p_src[0] + (lcu_height - 1) * FDEC_STRIDE, lcu_width * sizeof(pel8_t)); + g_funcs.fast_memcpy(h->intra_border8[1] + img_x_c, p_src[1] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel8_t)); + g_funcs.fast_memcpy(h->intra_border8[2] + img_x_c, p_src[2] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel8_t)); } } + } else { + pel10_t *p_src[3]; + pel10_t *p_dst[3]; + + /* ------------------------------------------------------------- + * 1, copy decoded LCU to frame buffer + */ + i_dst[0] = h->fdec->i_stride[0]; + i_dst[1] = h->fdec->i_stride[1]; + i_dst[2] = h->fdec->i_stride[2]; + p_dst[0] = h->fdec->planes10[0] + (img_y) * i_dst[0] + (img_x); + p_dst[1] = h->fdec->planes10[1] + (img_y_c) * i_dst[1] + (img_x_c); + p_dst[2] = h->fdec->planes10[2] + (img_y_c) * i_dst[2] + (img_x_c); + i_src[0] = i_src[1] = i_src[2] = FDEC_STRIDE; + p_src[0] = h->lcu.p_fdec10[0]; + p_src[1] = h->lcu.p_fdec10[1]; + p_src[2] = h->lcu.p_fdec10[2]; + + blk_w[0] = lcu_width; + blk_h[0] = lcu_height; + blk_w[1] = blk_w[2] = lcu_width_c; + blk_h[1] = blk_h[2] = lcu_height_c; + block_copy10_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3); + + /* ------------------------------------------------------------- + * 2, backup right col and bottom row pixels for intra coding + */ + if (h->fenc->b_enable_intra || h->fenc->i_frm_type == XAVS2_TYPE_I) { + // backup intra pred mode of bottom 4x4 row + int i_pred_mode_stride = h->i_width_in_minpu + 16; + int i_pred_mode_width_in_lcu = (1 << h->i_lcu_level) >> MIN_PU_SIZE_IN_BIT; + memcpy(h->ipredmode - i_pred_mode_stride + i_lcu_x * i_pred_mode_width_in_lcu, + h->ipredmode + i_pred_mode_stride * (i_pred_mode_width_in_lcu - 1) + i_lcu_x * i_pred_mode_width_in_lcu, + i_pred_mode_width_in_lcu * sizeof(int8_t)); + + /* cache top and left samples for intra prediction of next CTU */ + 
xavs2_cache_lcu_border10(h->lcu.ctu_border10[0].rec_top, h->intra_border10[0] + img_x + lcu_width - 1, p_src[0] + lcu_width - 1, + FDEC_STRIDE, lcu_width, lcu_height); + xavs2_cache_lcu_border10_uv(h->lcu.ctu_border10[1].rec_top, h->intra_border10[1] + img_x_c + lcu_width_c - 1, p_src[1] + lcu_width_c - 1, + h->lcu.ctu_border10[2].rec_top, h->intra_border10[2] + img_x_c + lcu_width_c - 1, p_src[2] + lcu_width_c - 1, + FDEC_STRIDE, lcu_width_c, lcu_height_c); + + /* 2.2, backup bottom row pixels */ + if (i_lcu_y < h->i_height_in_lcu - 1) { + g_funcs.fast_memcpy(h->intra_border10[0] + img_x, p_src[0] + (lcu_height - 1) * FDEC_STRIDE, lcu_width * sizeof(pel10_t)); + g_funcs.fast_memcpy(h->intra_border10[1] + img_x_c, p_src[1] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel10_t)); + g_funcs.fast_memcpy(h->intra_border10[2] + img_x_c, p_src[2] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel10_t)); + } + } + } } diff --git a/source/common/cudata.h b/source/common/cudata.h index c867091..de8edf3 100644 --- a/source/common/cudata.h +++ b/source/common/cudata.h @@ -49,7 +49,7 @@ void lcu_end(xavs2_t *h, int i_lcu_x, int i_lcu_y); */ static ALWAYS_INLINE int clip_qp(xavs2_t *h, int i_qp) { - /* AVS2-P2: 图像量化因子 picture_qp */ + /* AVS2-P2拢潞 脥录脧帽脕驴禄炉脪貌脳脫 picture_qp */ int max_qp = MAX_QP + (h->param->sample_bit_depth - 8) * 8; return XAVS2_MAX(MIN_QP, XAVS2_MIN(max_qp, i_qp)); } @@ -117,10 +117,21 @@ int cu_get_slice_index(xavs2_t *h, int scu_x, int scu_y) */ static ALWAYS_INLINE int cu_get_chroma_qp(xavs2_t *h, int luma_qp, int uv) { - int QP; - UNUSED_PARAMETER(uv); - UNUSED_PARAMETER(h); - QP = tab_qp_scale_chroma[XAVS2_CLIP3(0, 63, luma_qp)]; + //printf("luma_qp1: %d\n", luma_qp); + int QP = luma_qp + (uv == 0 ? 
h->param->chroma_quant_param_delta_u : h->param->chroma_quant_param_delta_v); + //printf("luma_qp2: %d\n", QP); + //UNUSED_PARAMETER(uv); + //UNUSED_PARAMETER(h); +//if (h->param->sample_bit_depth > 8) { + const int bit_depth_offset = ((h->param->sample_bit_depth - 8) << 3); + QP -= bit_depth_offset; + //printf("bit_depth_offset: %d\n", bit_depth_offset); + QP = QP < 0 ? QP : tab_qp_scale_chroma[QP]; + //printf("QP: %d\n", QP); + QP = tab_qp_scale_chroma[XAVS2_CLIP3(0, 63 + bit_depth_offset, QP + bit_depth_offset)]; +//} else { + //QP = tab_qp_scale_chroma[XAVS2_CLIP3(0, 63, luma_qp)]; +//} return QP; } diff --git a/source/common/defines.h b/source/common/defines.h index 575f0ce..2815930 100644 --- a/source/common/defines.h +++ b/source/common/defines.h @@ -57,7 +57,7 @@ * =========================================================================== */ -/* 检查算法是否开启 */ +/* 妫鏌ョ畻娉曟槸鍚﹀紑鍚 */ #define IS_ALG_ENABLE(alg) ((h->i_fast_algs >> alg) & 1) /* --------------------------------------------------------------------------- @@ -65,66 +65,66 @@ */ enum xavs2_fast_algorithms_e { /* fast inter */ - OPT_EARLY_SKIP , /* 基于时空相关性的快速SKIP决策 */ - OPT_PSC_MD , /* 基于时空相关性的快速模式决策 (prediction size correlation based mode decision) */ - OPT_FAST_CBF_MODE , /* 基于最优划分模式的CBF快速跳过剩余的划分模式 */ - OPT_FAST_PU_SEL , /* OPT_FAST_CBF_MODE的简化算法,cbf=0时,若2Nx2N不优于SKIP,则跳过剩余帧间模式和帧内模式 */ - OPT_BYPASS_AMP , /* 如果PRED_2NxN未获得最优,直接跳过相同划分方向的PRED_2NxnU/PRED_2NxnD; PRED_Nx2N同理 */ - OPT_DMH_CANDIDATE , /* 用于精简DMH模式下的RDO次数 */ - OPT_BYPASS_MODE_FPIC , /* F帧中的帧内模式与DMH模式跳过 */ - OPT_ADVANCE_CHROMA_AEC , /* 提前色度块的变换系数编码过程 */ + OPT_EARLY_SKIP , /* 鍩轰簬鏃剁┖鐩稿叧鎬х殑蹇烻KIP鍐崇瓥 */ + OPT_PSC_MD , /* 鍩轰簬鏃剁┖鐩稿叧鎬х殑蹇熸ā寮忓喅绛 (prediction size correlation based mode decision) */ + OPT_FAST_CBF_MODE , /* 鍩轰簬鏈浼樺垝鍒嗘ā寮忕殑CBF蹇熻烦杩囧墿浣欑殑鍒掑垎妯″紡 */ + OPT_FAST_PU_SEL , /* OPT_FAST_CBF_MODE鐨勭畝鍖栫畻娉曪紝cbf=0鏃讹紝鑻2Nx2N涓嶄紭浜嶴KIP锛屽垯璺宠繃鍓╀綑甯ч棿妯″紡鍜屽抚鍐呮ā寮 */ + OPT_BYPASS_AMP , /* 濡傛灉PRED_2NxN鏈幏寰楁渶浼橈紝鐩存帴璺宠繃鐩稿悓鍒掑垎鏂瑰悜鐨凱RED_2NxnU/PRED_2NxnD; PRED_Nx2N鍚岀悊 */ + 
OPT_DMH_CANDIDATE , /* 鐢ㄤ簬绮剧畝DMH妯″紡涓嬬殑RDO娆℃暟 */ + OPT_BYPASS_MODE_FPIC , /* F甯т腑鐨勫抚鍐呮ā寮忎笌DMH妯″紡璺宠繃 */ + OPT_ADVANCE_CHROMA_AEC , /* 鎻愬墠鑹插害鍧楃殑鍙樻崲绯绘暟缂栫爜杩囩▼ */ OPT_ROUGH_MODE_SKIP , /* */ - OPT_CMS_ETMD , /* 条件跳过帧内划分方式: - * (1)若I_2Nx2N不优于帧间预测模式,则不遍历帧内其他划分; - * (2)帧间最优模式的CBP为零时跳过帧内划分方式。*/ - OPT_ROUGH_PU_SEL , /* 粗略的PU划分模式搜索 */ - OPT_CBP_DIRECT , /* 根据direct模式下残差是否为全零块,跳过PU划分和CU递归划分 */ - OPT_SKIP_DMH_THRES , /* 通过Distortion的阈值决定跳过DMH模式的遍历 */ - OPT_ROUGH_SKIP_SEL , /* 通过distortion对比只对个别skip/direct模式做RDO */ + OPT_CMS_ETMD , /* 鏉′欢璺宠繃甯у唴鍒掑垎鏂瑰紡锛 + * 锛1锛夎嫢I_2Nx2N涓嶄紭浜庡抚闂撮娴嬫ā寮忥紝鍒欎笉閬嶅巻甯у唴鍏朵粬鍒掑垎锛 + * 锛2锛夊抚闂存渶浼樻ā寮忕殑CBP涓洪浂鏃惰烦杩囧抚鍐呭垝鍒嗘柟寮忋*/ + OPT_ROUGH_PU_SEL , /* 绮楃暐鐨凱U鍒掑垎妯″紡鎼滅储 */ + OPT_CBP_DIRECT , /* 鏍规嵁direct妯″紡涓嬫畫宸槸鍚︿负鍏ㄩ浂鍧楋紝璺宠繃PU鍒掑垎鍜孋U閫掑綊鍒掑垎 */ + OPT_SKIP_DMH_THRES , /* 閫氳繃Distortion鐨勯槇鍊煎喅瀹氳烦杩嘍MH妯″紡鐨勯亶鍘 */ + OPT_ROUGH_SKIP_SEL , /* 閫氳繃distortion瀵规瘮鍙涓埆skip/direct妯″紡鍋歊DO */ /* fast intra */ - OPT_BYPASS_SDIP , /* 如果PRED_I_2Nxn已获最优,直接跳过PRED_I_nx2N */ - OPT_FAST_INTRA_MODE , /* 帧内模式快速决策 */ - OPT_FAST_RDO_INTRA_C , /* 快速帧内Chroma预测模式优化,减少色度分量决策数量 */ - OPT_ET_RDO_INTRA_L , /* Luma RDO过程提前退出策略 */ - OPT_ET_INTRA_DEPTH , /* 基于MAD值的I帧depth划分提前终止 */ - OPT_BYPASS_INTRA_BPIC , /* B帧中若帧间预测模式的CBP为零,则跳过帧内预测模式决策 */ - OPT_FAST_INTRA_IN_INTER , /* 依据子CU的最优模式是否帧内及当前CU的帧间模式RDCost禁用帧间的帧内模式 */ + OPT_BYPASS_SDIP , /* 濡傛灉PRED_I_2Nxn宸茶幏鏈浼橈紝鐩存帴璺宠繃PRED_I_nx2N */ + OPT_FAST_INTRA_MODE , /* 甯у唴妯″紡蹇熷喅绛 */ + OPT_FAST_RDO_INTRA_C , /* 蹇熷抚鍐匔hroma棰勬祴妯″紡浼樺寲锛屽噺灏戣壊搴﹀垎閲忓喅绛栨暟閲 */ + OPT_ET_RDO_INTRA_L , /* Luma RDO杩囩▼鎻愬墠閫鍑虹瓥鐣 */ + OPT_ET_INTRA_DEPTH , /* 鍩轰簬MAD鍊肩殑I甯epth鍒掑垎鎻愬墠缁堟 */ + OPT_BYPASS_INTRA_BPIC , /* B甯т腑鑻ュ抚闂撮娴嬫ā寮忕殑CBP涓洪浂锛屽垯璺宠繃甯у唴棰勬祴妯″紡鍐崇瓥 */ + OPT_FAST_INTRA_IN_INTER , /* 渚濇嵁瀛怌U鐨勬渶浼樻ā寮忔槸鍚﹀抚鍐呭強褰撳墠CU鐨勫抚闂存ā寮廟DCost绂佺敤甯ч棿鐨勫抚鍐呮ā寮 */ /* fast CU depth */ - OPT_ECU , /* HM中全零SKIP模式终止下层划分 */ + OPT_ECU , /* HM涓叏闆禨KIP妯″紡缁堟涓嬪眰鍒掑垎 */ OPT_ET_HOMO_MV , /* */ OPT_CU_CSET , /* CSET of uAVS2, Only for inter frames that are not referenced by others */ - OPT_CU_DEPTH_CTRL , /* 
基于时空相关性的Depth估计,依据上、左、左上、右上和时域参考块level调整DEPTH范围,全I帧也适用 */ + OPT_CU_DEPTH_CTRL , /* 鍩轰簬鏃剁┖鐩稿叧鎬х殑Depth浼拌锛屼緷鎹笂銆佸乏銆佸乏涓娿佸彸涓婂拰鏃跺煙鍙傝冨潡level璋冩暣DEPTH鑼冨洿锛屽叏I甯т篃閫傜敤 */ OPT_CU_QSFD , /* CU splitting termination based on RD-Cost: Z. Wang, R. Wang, K. Fan, H. Sun, and W. Gao, - “uAVS2—Fast encoder for the 2nd generation IEEE 1857 video coding standard,” - Signal Process. Image Commun., vol. 53, no. October 2016, pp. 13–23, 2017. */ + 鈥渦AVS2鈥擣ast encoder for the 2nd generation IEEE 1857 video coding standard,鈥 + Signal Process. Image Commun., vol. 53, no. October 2016, pp. 13鈥23, 2017. */ /* fast transform and Quant */ - OPT_BYPASS_INTRA_RDOQ , /* 跳过B帧帧间编码中的帧内模式的RDOQ */ - OPT_RDOQ_AZPC , /* 通过对变换系数的阈值判断检测全零块进行RDOQ预处理,跳过色度分量的RDOQ过程*/ + OPT_BYPASS_INTRA_RDOQ , /* 璺宠繃B甯у抚闂寸紪鐮佷腑鐨勫抚鍐呮ā寮忕殑RDOQ */ + OPT_RDOQ_AZPC , /* 閫氳繃瀵瑰彉鎹㈢郴鏁扮殑闃堝煎垽鏂娴嬪叏闆跺潡杩涜RDOQ棰勫鐞嗭紝璺宠繃鑹插害鍒嗛噺鐨凴DOQ杩囩▼*/ /* others */ - OPT_FAST_ZBLOCK , /* 快速零块估计 */ - OPT_TR_KEY_FRAME_MD , /* 以更大概率跳过非关键帧的部分模式,能节省5%以上时间 */ - OPT_CODE_OPTIMZATION , /* OPT_CU_SUBCU_COST: 先编码大CU,再编码小CU时若前几个小CU的RDCost超过大CU的一定比率则跳过后续CU - * OPT_RDOQ_SKIP: 通过在RDOQ之前对变换系数的阈值判断检测全零块,跳过RDOQ过程 + OPT_FAST_ZBLOCK , /* 蹇熼浂鍧椾及璁 */ + OPT_TR_KEY_FRAME_MD , /* 浠ユ洿澶ф鐜囪烦杩囬潪鍏抽敭甯х殑閮ㄥ垎妯″紡锛岃兘鑺傜渷5%浠ヤ笂鏃堕棿 */ + OPT_CODE_OPTIMZATION , /* OPT_CU_SUBCU_COST: 鍏堢紪鐮佸ぇCU锛屽啀缂栫爜灏廋U鏃惰嫢鍓嶅嚑涓皬CU鐨凴DCost瓒呰繃澶U鐨勪竴瀹氭瘮鐜囧垯璺宠繃鍚庣画CU + * OPT_RDOQ_SKIP: 閫氳繃鍦≧DOQ涔嬪墠瀵瑰彉鎹㈢郴鏁扮殑闃堝煎垽鏂娴嬪叏闆跺潡锛岃烦杩嘡DOQ杩囩▼ */ - OPT_BIT_EST_PSZT , /* 快速TU比特估计:对33x32的亮度TU假定只有低频的16x16部分有非零系数 */ - OPT_TU_LEVEL_DEC , /* TU两层划分决策:对第一层TU划分选出最优,对最优做第二层TU划分,决策是否需要两层TU划分 */ - OPT_FAST_ALF , /* ALF快速算法,在顶层B帧(不被其余帧参考)禁用ALF,在所有ALF的协方差矩阵计算时,进行step=2的下采样 */ - OPT_FAST_SAO , /* SAO快速算法,在顶层B帧(不被其余帧参考)禁用SAO */ - OPT_SUBCU_SPLIT , /* 根据划分子块的数目决策父块是否对非SKIP模式做RDO */ - OPT_PU_RMS , /* 关闭小块(8x8,16x16)划分的预测单元,仅保留2Nx2N的帧内,帧间以及SKIP模式*/ - NUM_FAST_ALGS /* 总的快速算法数量 */ + OPT_BIT_EST_PSZT , /* 蹇烼U姣旂壒浼拌锛氬33x32鐨勪寒搴U鍋囧畾鍙湁浣庨鐨16x16閮ㄥ垎鏈夐潪闆剁郴鏁 */ + OPT_TU_LEVEL_DEC , /* TU涓ゅ眰鍒掑垎鍐崇瓥锛氬绗竴灞俆U鍒掑垎閫夊嚭鏈浼橈紝瀵规渶浼樺仛绗簩灞俆U鍒掑垎锛屽喅绛栨槸鍚﹂渶瑕佷袱灞俆U鍒掑垎 */ + OPT_FAST_ALF 
, /* ALF蹇熺畻娉曪紝鍦ㄩ《灞侭甯э紙涓嶈鍏朵綑甯у弬鑰冿級绂佺敤ALF锛屽湪鎵鏈堿LF鐨勫崗鏂瑰樊鐭╅樀璁$畻鏃讹紝杩涜step=2鐨勪笅閲囨牱 */ + OPT_FAST_SAO , /* SAO蹇熺畻娉曪紝鍦ㄩ《灞侭甯э紙涓嶈鍏朵綑甯у弬鑰冿級绂佺敤SAO */ + OPT_SUBCU_SPLIT , /* 鏍规嵁鍒掑垎瀛愬潡鐨勬暟鐩喅绛栫埗鍧楁槸鍚﹀闈濻KIP妯″紡鍋歊DO */ + OPT_PU_RMS , /* 鍏抽棴灏忓潡锛8x8,16x16)鍒掑垎鐨勯娴嬪崟鍏冿紝浠呬繚鐣2Nx2N鐨勫抚鍐咃紝甯ч棿浠ュ強SKIP妯″紡*/ + NUM_FAST_ALGS /* 鎬荤殑蹇熺畻娉曟暟閲 */ }; /* --------------------------------------------------------------------------- * const defines related with fast algorithms */ -#define SAVE_CU_INFO 1 /* 保存参考帧队列里的每一帧的cu type和cu bitsize,用于获取时域的cu模式和cu尺寸 */ +#define SAVE_CU_INFO 1 /* 淇濆瓨鍙傝冨抚闃熷垪閲岀殑姣忎竴甯х殑cu type鍜宑u bitsize锛岀敤浜庤幏鍙栨椂鍩熺殑cu妯″紡鍜宑u灏哄 */ #define NUM_INTRA_C_FULL_RD 4 /* --------------------------------------------------------------------------- @@ -144,7 +144,7 @@ enum xavs2_fast_algorithms_e { */ #define ENABLE_RATE_CONTROL_CU 0 /* Enable Rate-Control on CU level: 1: enable, 0: disable */ -#define ENABLE_AUTO_INIT_QP 1 /* 根据目标码率自动设置初始QP值 */ +#define ENABLE_AUTO_INIT_QP 1 /* 鏍规嵁鐩爣鐮佺巼鑷姩璁剧疆鍒濆QP鍊 */ /** @@ -224,16 +224,16 @@ enum xavs2_fast_algorithms_e { #define LAM_2Level_TU 0.8 #define DMH_MODE_NUM 5 /* number of DMH mode */ #define WPM_NUM 3 /* number of WPM */ -#define TH_PMVR 2 /* PMVR中四分之一像素精度MV的可用范围 */ +#define TH_PMVR 2 /* PMVR涓洓鍒嗕箣涓鍍忕礌绮惧害MV鐨勫彲鐢ㄨ寖鍥 */ /* --------------------------------------------------------------------------- * coefficient coding */ -#define MAX_TU_SIZE 32 /* 最大变换块大小,熵编码时的系数矩阵 */ -#define MAX_TU_SIZE_IN_BIT 5 /* 最大变换块大小,熵编码时的系数矩阵 */ -#define SIZE_CG 4 /* CG 大小 4x4 */ -#define SIZE_CG_IN_BIT 2 /* CG 大小 4x4 */ +#define MAX_TU_SIZE 32 /* 鏈澶у彉鎹㈠潡澶у皬锛岀喌缂栫爜鏃剁殑绯绘暟鐭╅樀 */ +#define MAX_TU_SIZE_IN_BIT 5 /* 鏈澶у彉鎹㈠潡澶у皬锛岀喌缂栫爜鏃剁殑绯绘暟鐭╅樀 */ +#define SIZE_CG 4 /* CG 澶у皬 4x4 */ +#define SIZE_CG_IN_BIT 2 /* CG 澶у皬 4x4 */ #define MAX_CG_NUM_IN_TU (1 << ((MAX_TU_SIZE_IN_BIT - SIZE_CG_IN_BIT) << 1)) /* --------------------------------------------------------------------------- @@ -247,14 +247,14 @@ enum xavs2_fast_algorithms_e { /* 
--------------------------------------------------------------------------- * SAO (Sample Adaptive Offset) */ -#define NUM_BO_OFFSET 32 /*BO模式下offset数量,其中最多4个非零*/ -#define MAX_NUM_SAO_CLASSES 32 /*最大offset数量*/ +#define NUM_BO_OFFSET 32 /*BO妯″紡涓媜ffset鏁伴噺锛屽叾涓渶澶4涓潪闆*/ +#define MAX_NUM_SAO_CLASSES 32 /*鏈澶ffset鏁伴噺*/ #define NUM_SAO_BO_CLASSES_LOG2 5 /**/ #define NUM_SAO_BO_CLASSES_IN_BIT 5 /**/ -#define NUM_SAO_BO_CLASSES (1 << NUM_SAO_BO_CLASSES_LOG2) /*BO模式下startband数目*/ -#define SAO_RATE_THR 1.0 /*亮度分量,用于RDO决策*/ -#define SAO_RATE_CHROMA_THR 1.0 /*色度分量,用于RDO决策*/ -#define SAO_SHIFT_PIX_NUM 4 /*SAO向左上偏移的像素点数*/ +#define NUM_SAO_BO_CLASSES (1 << NUM_SAO_BO_CLASSES_LOG2) /*BO妯″紡涓媠tartband鏁扮洰*/ +#define SAO_RATE_THR 1.0 /*浜害鍒嗛噺锛岀敤浜嶳DO鍐崇瓥*/ +#define SAO_RATE_CHROMA_THR 1.0 /*鑹插害鍒嗛噺锛岀敤浜嶳DO鍐崇瓥*/ +#define SAO_SHIFT_PIX_NUM 4 /*SAO鍚戝乏涓婂亸绉荤殑鍍忕礌鐐规暟*/ #define MAX_DOUBLE 1.7e+308 @@ -302,7 +302,7 @@ enum xavs2_fast_algorithms_e { #define MAX_SLICES 8 /* max number of slices in one picture */ #define MAX_PARALLEL_FRAMES 8 /* max number of parallel encoding frames */ #define MAX_COI_VALUE ((1<<8) - 1) /* max COI value (unsigned char) */ -#define PIXEL_MAX ((1< ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + const int pel_add = 1 << (ALF_NUM_BIT_SHIFT - 1); + const int pel_max = (1 << h->param->input_sample_bit_depth) - 1; + int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y; int endPos = b_down_avail ? 
(lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); - int xPosEnd = lcu_pix_x + lcu_width; - int min_x = lcu_pix_x - 3; - int max_x = xPosEnd - 1 + 3; + int min_x, max_x, xPosEnd; + min_x = -3; + max_x = lcu_width - 1 + 3; int yUp, yBottom; int xLeft, xRight; int x, y, pel_val; - pel_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6; + pel8_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6; - p_src += (startPos * i_src); - p_dst += (startPos * i_dst); + lcu_height = endPos - startPos; + lcu_height--; + + p_src += (startPos * i_src) + lcu_pix_x; + p_dst += (startPos * i_dst) + lcu_pix_x; for (y = startPos; y < endPos; y++) { yUp = XAVS2_CLIP3(startPos, endPos - 1, y - 1); @@ -104,22 +111,94 @@ void alf_filter_block1(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pel_val += alf_coeff[8] * (p_src [x ]); pel_val = (pel_val + pel_add) >> ALF_NUM_BIT_SHIFT; - p_dst[x] = (pel_t)XAVS2_CLIP1(pel_val); + p_dst[x] = (pel8_t)XAVS2_CLIP1(pel_val); + } + p_src += i_src; + p_dst += i_dst; + } +#undef XAVS2_CLIP1 +} + +static +void alf_filter10_block1(xavs2_t *h, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src, + int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, + int *alf_coeff, int b_top_avail, int b_down_avail) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + + const int pel_add = 1 << (ALF_NUM_BIT_SHIFT - 1); + const int pel_max = (1 << h->param->input_sample_bit_depth) - 1; + + int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y; + int endPos = b_down_avail ? 
(lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); + int min_x, max_x, xPosEnd; + xPosEnd = lcu_pix_x + lcu_width; + min_x = lcu_pix_x - 3; + max_x = xPosEnd - 1 + 3; + int yUp, yBottom; + int xLeft, xRight; + int x, y, pel_val; + pel10_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6; + + p_src += (startPos * i_src); + p_dst += (startPos * i_dst); + + for (y = 0; y <= lcu_height; y++) { + yUp = XAVS2_CLIP3(startPos, endPos - 1, y - 1); + yBottom = XAVS2_CLIP3(startPos, endPos - 1, y + 1); + p_src1 = p_src + (yBottom - y) * i_src; + p_src2 = p_src + (yUp - y) * i_src; + + yUp = XAVS2_CLIP3(startPos, endPos - 1, y - 2); + yBottom = XAVS2_CLIP3(startPos, endPos - 1, y + 2); + p_src3 = p_src + (yBottom - y) * i_src; + p_src4 = p_src + (yUp - y) * i_src; + + yUp = XAVS2_CLIP3(startPos, endPos - 1, y - 3); + yBottom = XAVS2_CLIP3(startPos, endPos - 1, y + 3); + p_src5 = p_src + (yBottom - y) * i_src; + p_src6 = p_src + (yUp - y) * i_src; + + for (x = 0; x < lcu_width; x++) { + pel_val = alf_coeff[0] * (p_src5[x] + p_src6[x]); + pel_val += alf_coeff[1] * (p_src3[x] + p_src4[x]); + + xLeft = XAVS2_CLIP3(min_x, max_x, x - 1); + xRight = XAVS2_CLIP3(min_x, max_x, x + 1); + pel_val += alf_coeff[2] * (p_src1[xRight] + p_src2[xLeft ]); + pel_val += alf_coeff[3] * (p_src1[x ] + p_src2[x ]); + pel_val += alf_coeff[4] * (p_src1[xLeft ] + p_src2[xRight]); + pel_val += alf_coeff[7] * (p_src [xRight] + p_src [xLeft ]); + + xLeft = XAVS2_CLIP3(min_x, max_x, x - 2); + xRight = XAVS2_CLIP3(min_x, max_x, x + 2); + pel_val += alf_coeff[6] * (p_src [xRight] + p_src [xLeft ]); + + xLeft = XAVS2_CLIP3(min_x, max_x, x - 3); + xRight = XAVS2_CLIP3(min_x, max_x, x + 3); + pel_val += alf_coeff[5] * (p_src [xRight] + p_src [xLeft ]); + pel_val += alf_coeff[8] * (p_src [x ]); + + pel_val = (pel_val + pel_add) >> ALF_NUM_BIT_SHIFT; + p_dst[x] = (pel10_t)XAVS2_CLIP3(0, pel_max, pel_val); } p_src += i_src; p_dst += i_dst; } +#undef XAVS2_CLIP1 } /* 
--------------------------------------------------------------------------- */ static -void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, +void alf_filter8_block2(xavs2_t *h, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail) { - pel_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6; +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + + pel8_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6; int pixelInt; int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y; int endPos = b_down_avail ? (lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); @@ -147,7 +226,7 @@ void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); - p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt); + p_dst[0] = (pel8_t)XAVS2_CLIP1(pixelInt); } p_src += lcu_width - 1; @@ -172,7 +251,7 @@ void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); - p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt); + p_dst[0] = (pel8_t)XAVS2_CLIP1(pixelInt); } /* last line */ @@ -200,7 +279,7 @@ void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); - p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt); + p_dst[0] = (pel8_t)XAVS2_CLIP1(pixelInt); } p_src += lcu_width - 1; @@ -225,22 +304,154 @@ void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); - p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt); + p_dst[0] = (pel8_t)XAVS2_CLIP1(pixelInt); } +#undef XAVS2_CLIP1 +} + +static +void alf_filter10_block2(xavs2_t *h, pel10_t 
*p_dst, int i_dst, pel10_t *p_src, int i_src, + int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, + int *alf_coeff, int b_top_avail, int b_down_avail) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + + pel10_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6; + int pixelInt; + int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y; + int endPos = b_down_avail ? (lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); + + /* first line */ + p_src += (startPos * i_src) + lcu_pix_x; + p_dst += (startPos * i_dst) + lcu_pix_x; + + if (p_src[0] != p_src[-1]) { + p_src1 = p_src + 1 * i_src; + p_src2 = p_src; + p_src3 = p_src + 2 * i_src; + p_src4 = p_src; + p_src5 = p_src + 3 * i_src; + p_src6 = p_src; + + pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); + pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); + pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[ 0]); + pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); + pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 1]); + pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); + pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); + pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); + pixelInt += alf_coeff[8] * (p_src [ 0]); + + pixelInt = (int)((pixelInt + 32) >> 6); + p_dst[0] = (pel10_t)XAVS2_CLIP1(pixelInt); + } + + p_src += lcu_width - 1; + p_dst += lcu_width - 1; + + if (p_src[0] != p_src[1]) { + p_src1 = p_src + 1 * i_src; + p_src2 = p_src; + p_src3 = p_src + 2 * i_src; + p_src4 = p_src; + p_src5 = p_src + 3 * i_src; + p_src6 = p_src; + + pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); + pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); + pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[-1]); + pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); + pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 0]); + pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); + pixelInt += 
alf_coeff[6] * (p_src [ 2] + p_src [-2]); + pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); + pixelInt += alf_coeff[8] * (p_src [ 0]); + + pixelInt = (int)((pixelInt + 32) >> 6); + p_dst[0] = (pel10_t)XAVS2_CLIP1(pixelInt); + } + + /* last line */ + p_src -= lcu_width - 1; + p_dst -= lcu_width - 1; + p_src += ((endPos - startPos - 1) * i_src); + p_dst += ((endPos - startPos - 1) * i_dst); + + if (p_src[0] != p_src[-1]) { + p_src1 = p_src; + p_src2 = p_src - 1 * i_src; + p_src3 = p_src; + p_src4 = p_src - 2 * i_src; + p_src5 = p_src; + p_src6 = p_src - 3 * i_src; + + pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); + pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); + pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[-1]); + pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); + pixelInt += alf_coeff[4] * (p_src1[ 0] + p_src2[ 1]); + pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); + pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); + pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); + pixelInt += alf_coeff[8] * (p_src [ 0]); + + pixelInt = (int)((pixelInt + 32) >> 6); + p_dst[0] = (pel10_t)XAVS2_CLIP1(pixelInt); + } + + p_src += lcu_width - 1; + p_dst += lcu_width - 1; + + if (p_src[0] != p_src[1]) { + p_src1 = p_src; + p_src2 = p_src - 1 * i_src; + p_src3 = p_src; + p_src4 = p_src - 2 * i_src; + p_src5 = p_src; + p_src6 = p_src - 3 * i_src; + + pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); + pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); + pixelInt += alf_coeff[2] * (p_src1[ 0] + p_src2[-1]); + pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); + pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 1]); + pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); + pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); + pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); + pixelInt += alf_coeff[8] * (p_src [ 0]); + + pixelInt = (int)((pixelInt + 32) >> 6); + p_dst[0] = (pel10_t)XAVS2_CLIP1(pixelInt); + } +#undef XAVS2_CLIP1 } /* 
--------------------------------------------------------------------------- */ -void xavs2_alf_init(uint32_t cpuid, intrinsic_func_t *pf) +void xavs2_alf_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf) { + if (param->input_sample_bit_depth == 8) { /* set function handles */ - pf->alf_flt[0] = alf_filter_block1; - pf->alf_flt[1] = alf_filter_block2; + pf->alf_flt8[0] = alf_filter8_block1; + pf->alf_flt8[1] = alf_filter8_block2; #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE42) { - pf->alf_flt[0] = alf_flt_one_block_sse128; + pf->alf_flt8[0] = alf_flt_one_block_sse128; } #else UNUSED_PARAMETER(cpuid); #endif + } else { + /* set function handles */ + pf->alf_flt10[0] = alf_filter10_block1; + pf->alf_flt10[1] = alf_filter10_block2; +#if HAVE_MMX + if (cpuid & XAVS2_CPU_SSE42) { + pf->alf_flt10[0] = alf_flt_one_block_sse128; + } +#else + UNUSED_PARAMETER(cpuid); +#endif + } } diff --git a/source/common/filter_deblock.c b/source/common/filter_deblock.c index 55edc37..7434511 100644 --- a/source/common/filter_deblock.c +++ b/source/common/filter_deblock.c @@ -290,7 +290,7 @@ uint8_t lf_skip_filter(xavs2_t *h, cu_info_t *MbP, cu_info_t *MbQ, int dir, int /* --------------------------------------------------------------------------- */ -static void lf_edge_core(pel_t *src, int b_chroma, int ptr_inc, int inc1, int alpha, int beta, uint8_t *flt_flag) +static void lf_edge_core8(pel8_t *src, int b_chroma, int ptr_inc, int inc1, int alpha, int beta, uint8_t *flt_flag) { int pel; int abs_delta; @@ -347,26 +347,114 @@ static void lf_edge_core(pel_t *src, int b_chroma, int ptr_inc, int inc1, int al switch (fs) { case 4: - src[-inc1] = (pel_t)((L0 + ((L0 + L2) << 3) + L2 + (R0 << 3) + (R2 << 2) + (R2 << 1) + 16) >> 5); // L0 - src[-inc2] = (pel_t)(((L0 << 3) - L0 + (L2 << 2) + (L2 << 1) + R0 + (R0 << 1) + 8) >> 4); // L1 - src[-inc3] = (pel_t)(((L0 << 2) + L2 + (L2 << 1) + R0 + 4) >> 3); // L2 - src[ 0] = (pel_t)((R0 + ((R0 + R2) << 3) + R2 + (L0 << 3) + (L2 << 2) + (L2 << 
1) + 16) >> 5); // R0 - src[ inc1] = (pel_t)(((R0 << 3) - R0 + (R2 << 2) + (R2 << 1) + L0 + (L0 << 1) + 8) >> 4); // R1 - src[ inc2] = (pel_t)(((R0 << 2) + R2 + (R2 << 1) + L0 + 4) >> 3); // R2 + src[-inc1] = (pel8_t)((L0 + ((L0 + L2) << 3) + L2 + (R0 << 3) + (R2 << 2) + (R2 << 1) + 16) >> 5); // L0 + src[-inc2] = (pel8_t)(((L0 << 3) - L0 + (L2 << 2) + (L2 << 1) + R0 + (R0 << 1) + 8) >> 4); // L1 + src[-inc3] = (pel8_t)(((L0 << 2) + L2 + (L2 << 1) + R0 + 4) >> 3); // L2 + src[ 0] = (pel8_t)((R0 + ((R0 + R2) << 3) + R2 + (L0 << 3) + (L2 << 2) + (L2 << 1) + 16) >> 5); // R0 + src[ inc1] = (pel8_t)(((R0 << 3) - R0 + (R2 << 2) + (R2 << 1) + L0 + (L0 << 1) + 8) >> 4); // R1 + src[ inc2] = (pel8_t)(((R0 << 2) + R2 + (R2 << 1) + L0 + 4) >> 3); // R2 break; case 3: - src[-inc1] = (pel_t)((L2 + (L1 << 2) + (L0 << 2) + (L0 << 1) + (R0 << 2) + R1 + 8) >> 4); // L0 - src[ 0] = (pel_t)((L1 + (L0 << 2) + (R0 << 2) + (R0 << 1) + (R1 << 2) + R2 + 8) >> 4); // R0 - src[-inc2] = (pel_t)((L2 * 3 + L1 * 8 + L0 * 4 + R0 + 8) >> 4); - src[ inc1] = (pel_t)((R2 * 3 + R1 * 8 + R0 * 4 + L0 + 8) >> 4); + src[-inc1] = (pel8_t)((L2 + (L1 << 2) + (L0 << 2) + (L0 << 1) + (R0 << 2) + R1 + 8) >> 4); // L0 + src[ 0] = (pel8_t)((L1 + (L0 << 2) + (R0 << 2) + (R0 << 1) + (R1 << 2) + R2 + 8) >> 4); // R0 + src[-inc2] = (pel8_t)((L2 * 3 + L1 * 8 + L0 * 4 + R0 + 8) >> 4); + src[ inc1] = (pel8_t)((R2 * 3 + R1 * 8 + R0 * 4 + L0 + 8) >> 4); break; case 2: - src[-inc1] = (pel_t)(((L1 << 1) + L1 + (L0 << 3) + (L0 << 1) + (R0 << 1) + R0 + 8) >> 4); - src[ 0] = (pel_t)(((L0 << 1) + L0 + (R0 << 3) + (R0 << 1) + (R1 << 1) + R1 + 8) >> 4); + src[-inc1] = (pel8_t)(((L1 << 1) + L1 + (L0 << 3) + (L0 << 1) + (R0 << 1) + R0 + 8) >> 4); + src[ 0] = (pel8_t)(((L0 << 1) + L0 + (R0 << 3) + (R0 << 1) + (R1 << 1) + R1 + 8) >> 4); break; case 1: - src[-inc1] = (pel_t)((L0 * 3 + R0 + 2) >> 2); - src[ 0] = (pel_t)((R0 * 3 + L0 + 2) >> 2); + src[-inc1] = (pel8_t)((L0 * 3 + R0 + 2) >> 2); + src[ 0] = (pel8_t)((R0 * 3 + L0 + 2) >> 
2); + break; + default: + break; + } + } + + src += ptr_inc; // next row or column + pel += b_chroma; + } +} + +static void lf_edge_core10(pel10_t *src, int b_chroma, int ptr_inc, int inc1, int alpha, int beta, uint8_t *flt_flag) +{ + int pel; + int abs_delta; + int L2, L1, L0, R0, R1, R2; + int fs; // fs stands for filtering strength. The larger fs is, the stronger filter is applied. + int FlatnessL, FlatnessR; + int inc2, inc3; + int flag = 0; + + inc2 = inc1 << 1; + inc3 = inc1 + inc2; + for (pel = 0; pel < MIN_CU_SIZE; pel++) { + L2 = src[-inc3]; + L1 = src[-inc2]; + L0 = src[-inc1]; + R0 = src[ 0]; + R1 = src[ inc1]; + R2 = src[ inc2]; + + abs_delta = XAVS2_ABS(R0 - L0); + flag = (pel < 4) ? flt_flag[0] : flt_flag[1]; + if (flag && (abs_delta < alpha) && (abs_delta > 1)) { + FlatnessL = (XAVS2_ABS(L1 - L0) < beta) ? 2 : 0; + if (XAVS2_ABS(L2 - L0) < beta) { + FlatnessL += 1; + } + + FlatnessR = (XAVS2_ABS(R0 - R1) < beta) ? 2 : 0; + if (XAVS2_ABS(R0 - R2) < beta) { + FlatnessR += 1; + } + + switch (FlatnessL + FlatnessR) { + case 6: + fs = (R1 == R0 && L0 == L1) ? 4 : 3; + break; + case 5: + fs = (R1 == R0 && L0 == L1) ? 3 : 2; + break; + case 4: + fs = (FlatnessL == 2) ? 2 : 1; + break; + case 3: + fs = (XAVS2_ABS(L1 - R1) < beta) ? 
1 : 0; + break; + default: + fs = 0; + break; + } + + if (b_chroma && fs > 0) { + fs--; + } + + switch (fs) { + case 4: + src[-inc1] = (pel10_t)((L0 + ((L0 + L2) << 3) + L2 + (R0 << 3) + (R2 << 2) + (R2 << 1) + 16) >> 5); // L0 + src[-inc2] = (pel10_t)(((L0 << 3) - L0 + (L2 << 2) + (L2 << 1) + R0 + (R0 << 1) + 8) >> 4); // L1 + src[-inc3] = (pel10_t)(((L0 << 2) + L2 + (L2 << 1) + R0 + 4) >> 3); // L2 + src[ 0] = (pel10_t)((R0 + ((R0 + R2) << 3) + R2 + (L0 << 3) + (L2 << 2) + (L2 << 1) + 16) >> 5); // R0 + src[ inc1] = (pel10_t)(((R0 << 3) - R0 + (R2 << 2) + (R2 << 1) + L0 + (L0 << 1) + 8) >> 4); // R1 + src[ inc2] = (pel10_t)(((R0 << 2) + R2 + (R2 << 1) + L0 + 4) >> 3); // R2 + break; + case 3: + src[-inc1] = (pel10_t)((L2 + (L1 << 2) + (L0 << 2) + (L0 << 1) + (R0 << 2) + R1 + 8) >> 4); // L0 + src[ 0] = (pel10_t)((L1 + (L0 << 2) + (R0 << 2) + (R0 << 1) + (R1 << 2) + R2 + 8) >> 4); // R0 + src[-inc2] = (pel10_t)((L2 * 3 + L1 * 8 + L0 * 4 + R0 + 8) >> 4); + src[ inc1] = (pel10_t)((R2 * 3 + R1 * 8 + R0 * 4 + L0 + 8) >> 4); + break; + case 2: + src[-inc1] = (pel10_t)(((L1 << 1) + L1 + (L0 << 3) + (L0 << 1) + (R0 << 1) + R0 + 8) >> 4); + src[ 0] = (pel10_t)(((L0 << 1) + L0 + (R0 << 3) + (R0 << 1) + (R1 << 1) + R1 + 8) >> 4); + break; + case 1: + src[-inc1] = (pel10_t)((L0 * 3 + R0 + 2) >> 2); + src[ 0] = (pel10_t)((R0 * 3 + L0 + 2) >> 2); break; default: break; @@ -380,45 +468,113 @@ static void lf_edge_core(pel_t *src, int b_chroma, int ptr_inc, int inc1, int al /* --------------------------------------------------------------------------- */ -static void deblock_edge_hor(pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag) +static void deblock_edge_hor8(xavs2_t *h, pel8_t *src, int stride, int alpha, int beta, uint8_t *flt_flag) { - lf_edge_core(src, 0, 1, stride, alpha, beta, flt_flag); + lf_edge_core8(src, 0, 1, stride, alpha, beta, flt_flag); +} + +static void deblock_edge_hor10(xavs2_t *h, pel10_t *src, int stride, int alpha, int beta, uint8_t 
*flt_flag) +{ + lf_edge_core10(src, 0, 1, stride, alpha, beta, flt_flag); } /* --------------------------------------------------------------------------- */ -static void deblock_edge_ver(pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag) +static void deblock_edge_ver8(xavs2_t *h, pel8_t *src, int stride, int alpha, int beta, uint8_t *flt_flag) { - lf_edge_core(src, 0, stride, 1, alpha, beta, flt_flag); + lf_edge_core8(src, 0, stride, 1, alpha, beta, flt_flag); +} + +static void deblock_edge_ver10(xavs2_t *h, pel10_t *src, int stride, int alpha, int beta, uint8_t *flt_flag) +{ + lf_edge_core10(src, 0, stride, 1, alpha, beta, flt_flag); } /* --------------------------------------------------------------------------- */ -static void deblock_edge_ver_c(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag) +static void deblock_edge_ver8_c(xavs2_t *h, pel8_t *src_u, pel8_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag) +{ + lf_edge_core8(src_u, 1, stride, 1, alpha, beta, flt_flag); + lf_edge_core8(src_v, 1, stride, 1, alpha, beta, flt_flag); +} + +static void deblock_edge_ver10_c(xavs2_t *h, pel10_t *src_u, pel10_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag) { - lf_edge_core(src_u, 1, stride, 1, alpha, beta, flt_flag); - lf_edge_core(src_v, 1, stride, 1, alpha, beta, flt_flag); + lf_edge_core10(src_u, 1, stride, 1, alpha, beta, flt_flag); + lf_edge_core10(src_v, 1, stride, 1, alpha, beta, flt_flag); } /* --------------------------------------------------------------------------- */ -static void deblock_edge_hor_c(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag) +static void deblock_edge_hor8_c(xavs2_t *h, pel8_t *src_u, pel8_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag) { - lf_edge_core(src_u, 1, 1, stride, alpha, beta, flt_flag); - lf_edge_core(src_v, 1, 1, stride, alpha, beta, flt_flag); + lf_edge_core8(src_u, 1, 1, stride, alpha, beta, flt_flag); + 
lf_edge_core8(src_v, 1, 1, stride, alpha, beta, flt_flag); +} + +static void deblock_edge_hor10_c(xavs2_t *h, pel10_t *src_u, pel10_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag) +{ + lf_edge_core10(src_u, 1, 1, stride, alpha, beta, flt_flag); + lf_edge_core10(src_v, 1, 1, stride, alpha, beta, flt_flag); } /* --------------------------------------------------------------------------- */ static -void lf_scu_deblock(xavs2_t *h, pel_t *p_rec[3], int i_stride, int i_stride_c, int scu_x, int scu_y, int dir) +void lf_scu_deblock8(xavs2_t *h, pel8_t *p_rec[3], int i_stride, int i_stride_c, int scu_x, int scu_y, int dir) +{ +#define MAX_QP_DEBLOCK MAX_QP + cu_info_t *MbQ = &h->cu_info[scu_y * h->i_width_in_mincu + scu_x]; /* current SCU */ + int edge_type = h->p_deblock_flag[dir][(scu_y - h->lcu.i_scu_y) * h->i_width_in_mincu + scu_x]; + + if (edge_type != EDGE_TYPE_NOFILTER) { + pel8_t *src_y = p_rec[0] + (scu_y << MIN_CU_SIZE_IN_BIT) * i_stride + (scu_x << MIN_CU_SIZE_IN_BIT); + cu_info_t *MbP = dir ? 
(MbQ - h->i_width_in_mincu) : (MbQ - 1); /* MbP = Mb of the remote 4x4 block */ + int QP = (cu_get_qp(h, MbP) + cu_get_qp(h, MbQ) + 1) >> 1; /* average QP of the two blocks */ + int shift = h->param->sample_bit_depth - 8; + int offset = shift << 3; /* coded as 10/12 bit, QP is added by (8 * (h->param->sample_bit_depth - 8)) in config file */ + int alpha, beta; + uint8_t b_filter_edge[2]; + + b_filter_edge[0] = lf_skip_filter(h, MbP, MbQ, dir, (scu_x << 1), (scu_y << 1)); + b_filter_edge[1] = lf_skip_filter(h, MbP, MbQ, dir, (scu_x << 1) + dir, (scu_y << 1) + !dir); + + if (b_filter_edge[0] == 0 && b_filter_edge[1] == 0) { + return; + } + + /* deblock luma edge */ + alpha = tab_deblock_alpha[XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP - offset + h->param->alpha_c_offset)] << shift; + beta = tab_deblock_beta [XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP - offset + h->param->beta_offset)] << shift; + + g_funcs.deblock_luma8[dir](h, src_y, i_stride, alpha, beta, b_filter_edge); + + assert(h->param->chroma_format == CHROMA_420 || h->param->chroma_format == CHROMA_400); /* only support I420/I400 now */ + /* deblock chroma edge */ + if (edge_type == EDGE_TYPE_BOTH && h->param->chroma_format == CHROMA_420) + if ((((scu_y & 1) == 0) && dir) || (((scu_x & 1) == 0) && (!dir))) { + pel8_t *src_u = p_rec[1] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1)); + pel8_t *src_v = p_rec[2] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1)); + + int alpha_c, beta_c; + QP = cu_get_chroma_qp(h, QP, 0) - offset; + alpha_c = tab_deblock_alpha[XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP + h->param->alpha_c_offset)] << shift; + beta_c = tab_deblock_beta [XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP + h->param->beta_offset)] << shift; + g_funcs.deblock_chroma8[dir](h, src_u, src_v, i_stride_c, alpha_c, beta_c, b_filter_edge); + } + } +#undef MAX_QP_DEBLOCK +} + +static +void lf_scu_deblock10(xavs2_t *h, pel10_t *p_rec[3], int i_stride, int i_stride_c, int 
scu_x, int scu_y, int dir) { - static const int max_qp_deblock = 63; +#define MAX_QP_DEBLOCK (MAX_QP + (h->param->sample_bit_depth - 8) * 8) cu_info_t *MbQ = &h->cu_info[scu_y * h->i_width_in_mincu + scu_x]; /* current SCU */ int edge_type = h->p_deblock_flag[dir][(scu_y - h->lcu.i_scu_y) * h->i_width_in_mincu + scu_x]; if (edge_type != EDGE_TYPE_NOFILTER) { - pel_t *src_y = p_rec[0] + (scu_y << MIN_CU_SIZE_IN_BIT) * i_stride + (scu_x << MIN_CU_SIZE_IN_BIT); + pel10_t *src_y = p_rec[0] + (scu_y << MIN_CU_SIZE_IN_BIT) * i_stride + (scu_x << MIN_CU_SIZE_IN_BIT); cu_info_t *MbP = dir ? (MbQ - h->i_width_in_mincu) : (MbQ - 1); /* MbP = Mb of the remote 4x4 block */ int QP = (cu_get_qp(h, MbP) + cu_get_qp(h, MbQ) + 1) >> 1; /* average QP of the two blocks */ int shift = h->param->sample_bit_depth - 8; @@ -434,25 +590,26 @@ void lf_scu_deblock(xavs2_t *h, pel_t *p_rec[3], int i_stride, int i_stride_c, i } /* deblock luma edge */ - alpha = tab_deblock_alpha[XAVS2_CLIP3(0, max_qp_deblock, QP - offset + h->param->alpha_c_offset)] << shift; - beta = tab_deblock_beta [XAVS2_CLIP3(0, max_qp_deblock, QP - offset + h->param->beta_offset)] << shift; + alpha = tab_deblock_alpha[XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP - offset + h->param->alpha_c_offset)] << shift; + beta = tab_deblock_beta [XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP - offset + h->param->beta_offset)] << shift; - g_funcs.deblock_luma[dir](src_y, i_stride, alpha, beta, b_filter_edge); + g_funcs.deblock_luma10[dir](h, src_y, i_stride, alpha, beta, b_filter_edge); assert(h->param->chroma_format == CHROMA_420 || h->param->chroma_format == CHROMA_400); /* only support I420/I400 now */ /* deblock chroma edge */ if (edge_type == EDGE_TYPE_BOTH && h->param->chroma_format == CHROMA_420) if ((((scu_y & 1) == 0) && dir) || (((scu_x & 1) == 0) && (!dir))) { - pel_t *src_u = p_rec[1] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1)); - pel_t *src_v = p_rec[2] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * 
i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1)); + pel10_t *src_u = p_rec[1] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1)); + pel10_t *src_v = p_rec[2] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1)); int alpha_c, beta_c; QP = cu_get_chroma_qp(h, QP, 0) - offset; - alpha_c = tab_deblock_alpha[XAVS2_CLIP3(0, max_qp_deblock, QP + h->param->alpha_c_offset)] << shift; - beta_c = tab_deblock_beta [XAVS2_CLIP3(0, max_qp_deblock, QP + h->param->beta_offset)] << shift; - g_funcs.deblock_chroma[dir](src_u, src_v, i_stride_c, alpha_c, beta_c, b_filter_edge); + alpha_c = tab_deblock_alpha[XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP + h->param->alpha_c_offset)] << shift; + beta_c = tab_deblock_beta [XAVS2_CLIP3(0, MAX_QP_DEBLOCK, QP + h->param->beta_offset)] << shift; + g_funcs.deblock_chroma10[dir](h, src_u, src_v, i_stride_c, alpha_c, beta_c, b_filter_edge); } } +#undef MAX_QP_DEBLOCK } /** @@ -491,7 +648,11 @@ void xavs2_lcu_deblock(xavs2_t *h, xavs2_frame_t *frm) /* deblock all vertical edges in one LCU */ for (j = 0; j < num_of_scu_ver; j++) { for (i = 0; i < num_of_scu_hor; i++) { - lf_scu_deblock(h, frm->planes, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_VER); + if (h->param->input_sample_bit_depth == 8) { + lf_scu_deblock8(h, frm->planes8, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_VER); + } else { + lf_scu_deblock10(h, frm->planes10, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_VER); + } } } @@ -512,35 +673,70 @@ void xavs2_lcu_deblock(xavs2_t *h, xavs2_frame_t *frm) /* deblock all horizontal edges in one LCU */ for (j = 0; j < num_of_scu_ver; j++) { for (i = 0; i < num_of_scu_hor; i++) { - lf_scu_deblock(h, frm->planes, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_HOR); + if (h->param->input_sample_bit_depth == 8) { + lf_scu_deblock8(h, frm->planes8, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_HOR); + } else { + lf_scu_deblock10(h, frm->planes10, i_stride, i_stride_c, 
scu_x + i, scu_y + j, EDGE_HOR); + } } } } /* --------------------------------------------------------------------------- */ -void xavs2_deblock_init(uint32_t cpuid, intrinsic_func_t* lf) +void xavs2_deblock_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t* lf) { - lf->deblock_luma [0] = deblock_edge_ver; - lf->deblock_luma [1] = deblock_edge_hor; - lf->deblock_chroma[0] = deblock_edge_ver_c; - lf->deblock_chroma[1] = deblock_edge_hor_c; + if (param->input_sample_bit_depth == 8) { + lf->deblock_luma8 [0] = deblock_edge_ver8; + lf->deblock_luma8 [1] = deblock_edge_hor8; + lf->deblock_chroma8[0] = deblock_edge_ver8_c; + lf->deblock_chroma8[1] = deblock_edge_hor8_c; #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE42) { - lf->deblock_luma[0] = deblock_edge_ver_sse128; - lf->deblock_luma[1] = deblock_edge_hor_sse128; - // lf->deblock_chroma[0] = deblock_edge_ver_c_sse128; - // lf->deblock_chroma[1] = deblock_edge_hor_c_sse128; + lf->deblock_luma8[0] = deblock_edge_ver_sse128; + lf->deblock_luma8[1] = deblock_edge_hor_sse128; + // lf->deblock_chroma8[0] = deblock_edge_ver_c_sse128; + // lf->deblock_chroma8[1] = deblock_edge_hor_c_sse128; } + +#if defined(__AVX2__) if (cpuid & XAVS2_CPU_AVX2) { // In some machines, avx is slower than SSE - // lf->deblock_luma[0] = deblock_edge_ver_avx2; - // lf->deblock_luma[1] = deblock_edge_hor_avx2; - // lf->deblock_chroma[0] = deblock_edge_ver_c_avx2; - // lf->deblock_chroma[1] = deblock_edge_hor_c_avx2; + // lf->deblock_luma8[0] = deblock_edge_ver_avx2; + // lf->deblock_luma8[1] = deblock_edge_hor_avx2; + // lf->deblock_chroma8[0] = deblock_edge_ver_c_avx2; + // lf->deblock_chroma8[1] = deblock_edge_hor_c_avx2; } +#endif #else UNUSED_PARAMETER(cpuid); #endif + } else { + lf->deblock_luma10 [0] = deblock_edge_ver10; + lf->deblock_luma10 [1] = deblock_edge_hor10; + lf->deblock_chroma10[0] = deblock_edge_ver10_c; + lf->deblock_chroma10[1] = deblock_edge_hor10_c; + +#if HAVE_MMX + if (cpuid & XAVS2_CPU_SSE42) { + lf->deblock_luma10[0] = 
deblock_edge_ver_sse128; + lf->deblock_luma10[1] = deblock_edge_hor_sse128; + // lf->deblock_chroma10[0] = deblock_edge_ver_c_sse128; + // lf->deblock_chroma10[1] = deblock_edge_hor_c_sse128; + } + +#if defined(__AVX2__) + if (cpuid & XAVS2_CPU_AVX2) { + // In some machines, avx is slower than SSE + // lf->deblock_luma10[0] = deblock_edge_ver_avx2; + // lf->deblock_luma10[1] = deblock_edge_hor_avx2; + // lf->deblock_chroma10[0] = deblock_edge_ver_c_avx2; + // lf->deblock_chroma10[1] = deblock_edge_hor_c_avx2; + } +#endif +#else + UNUSED_PARAMETER(cpuid); +#endif + } } diff --git a/source/common/filter_sao.c b/source/common/filter_sao.c index d813ee9..022d745 100644 --- a/source/common/filter_sao.c +++ b/source/common/filter_sao.c @@ -48,14 +48,14 @@ /* --------------------------------------------------------------------------- */ -static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, +static void sao_block8_c(xavs2_t *h, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src, int i_block_w, int i_block_h, int *lcu_avail, SAOBlkParam *sao_param) { int8_t SIGN_BUF[MAX_CU_SIZE + 32]; // sign of top line int8_t *UPROW_S = SIGN_BUF + 16; int *sao_offset = sao_param->offset; - const int max_pel_val = (1 << g_bit_depth) - 1; + const int max_pel_val = (1 << h->param->input_sample_bit_depth) - 1; int reg = 0; int sx, sy, ex, ey; // start/end (x, y) int sx_0, ex_0, sx_n, ex_n; // start/end x for first and last row @@ -75,7 +75,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, right_sign = xavs2_sign3(p_src[x] - p_src[x + 1]); edge_type = left_sign + right_sign + 2; left_sign = -right_sign; - p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } p_src += i_src; p_dst += i_dst; @@ -92,7 +92,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? 
-1 : 0); edge_type = down_sign + top_sign + 2; top_sign = -down_sign; - p_dst[y * i_dst + x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[y * i_src + x] + sao_offset[edge_type]); + p_dst[y * i_dst + x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[y * i_src + x] + sao_offset[edge_type]); } } break; @@ -115,7 +115,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pel_diff = p_src[x] - p_src[-i_src + x - 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = top_sign - UPROW_S[x + 1] + 2; - p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } // middle rows @@ -131,7 +131,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pel_diff = p_src[x] - p_src[i_src + x + 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; - p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); UPROW_S[x] = (int8_t)reg; reg = -down_sign; } @@ -151,7 +151,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pel_diff = p_src[x] - p_src[i_src + x + 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; - p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } break; case SAO_TYPE_EO_45: @@ -172,7 +172,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pel_diff = p_src[x] - p_src[-i_src + x + 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? 
-1 : 0); edge_type = top_sign - UPROW_S[x - 1] + 2; - p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } // middle rows @@ -188,7 +188,7 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pel_diff = p_src[x] - p_src[i_src + x - 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; - p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); UPROW_S[x - 1] = (int8_t)(-down_sign); } } @@ -207,15 +207,197 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, pel_diff = p_src[x] - p_src[i_src + x - 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; - p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } break; case SAO_TYPE_BO: - pel_diff = g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; + pel_diff = h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; for (y = 0; y < i_block_h; y++) { for (x = 0; x < i_block_w; x++) { edge_type = p_src[x] >> pel_diff; - p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + p_dst[x] = (pel8_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + } + p_src += i_src; + p_dst += i_dst; + } + break; + default: + xavs2_log(NULL, XAVS2_LOG_ERROR, "Not a supported SAO types."); + assert(0); + exit(-1); + } +} + +static void sao_block10_c(xavs2_t *h, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src, + int i_block_w, int i_block_h, + int *lcu_avail, SAOBlkParam *sao_param) +{ + int8_t SIGN_BUF[MAX_CU_SIZE + 32]; // sign of top line + int8_t *UPROW_S = SIGN_BUF + 16; + int *sao_offset = sao_param->offset; + const int max_pel_val = 
(1 << h->param->input_sample_bit_depth) - 1; + int reg = 0; + int sx, sy, ex, ey; // start/end (x, y) + int sx_0, ex_0, sx_n, ex_n; // start/end x for first and last row + int left_sign, right_sign, top_sign, down_sign; + int edge_type; + int pel_diff; + int x, y; + + assert(sao_param->typeIdc != SAO_TYPE_OFF); + switch (sao_param->typeIdc) { + case SAO_TYPE_EO_0: + sx = lcu_avail[SAO_L] ? 0 : 1; + ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); + for (y = 0; y < i_block_h; y++) { + pel_diff = p_src[sx] - p_src[sx - 1]; + left_sign = pel_diff > 0? 1 : (pel_diff < 0? -1 : 0); + for (x = sx; x < ex; x++) { + pel_diff = p_src[x] - p_src[x + 1]; + right_sign = pel_diff > 0? 1 : (pel_diff < 0? -1 : 0); + edge_type = left_sign + right_sign + 2; + left_sign = -right_sign; + p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + } + p_src += i_src; + p_dst += i_dst; + } + break; + case SAO_TYPE_EO_90: { + sy = lcu_avail[SAO_T] ? 0 : 1; + ey = lcu_avail[SAO_D] ? i_block_h : (i_block_h - 1); + for (x = 0; x < i_block_w; x++) { + pel_diff = p_src[sy * i_src + x] - p_src[(sy - 1) * i_src + x]; + top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + for (y = sy; y < ey; y++) { + pel_diff = p_src[y * i_src + x] - p_src[(y + 1) * i_src + x]; + down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + edge_type = down_sign + top_sign + 2; + top_sign = -down_sign; + p_dst[y * i_dst + x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[y * i_src + x] + sao_offset[edge_type]); + } + } + break; + } + case SAO_TYPE_EO_135: + sx = lcu_avail[SAO_L] ? 0 : 1; + ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); + + // init the line buffer + for (x = sx; x < ex; x++) { + pel_diff = p_src[i_src + x + 1] - p_src[x]; + top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + UPROW_S[x + 1] = (int8_t)top_sign; + } + + // first row + sx_0 = lcu_avail[SAO_TL] ? 0 : 1; + ex_0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1)) : 1; + for (x = sx_0; x < ex_0; x++) { + pel_diff = p_src[x] - p_src[-i_src + x - 1]; + top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + edge_type = top_sign - UPROW_S[x + 1] + 2; + p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + } + + // middle rows + for (y = 1; y < i_block_h - 1; y++) { + p_src += i_src; + p_dst += i_dst; + for (x = sx; x < ex; x++) { + if (x == sx) { + pel_diff = p_src[x] - p_src[-i_src + x - 1]; + top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + UPROW_S[x] = (int8_t)top_sign; + } + pel_diff = p_src[x] - p_src[i_src + x + 1]; + down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + edge_type = down_sign + UPROW_S[x] + 2; + p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + UPROW_S[x] = (int8_t)reg; + reg = -down_sign; + } + } + + // last row + sx_n = lcu_avail[SAO_D] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); + ex_n = lcu_avail[SAO_DR] ? i_block_w : (i_block_w - 1); + p_src += i_src; + p_dst += i_dst; + for (x = sx_n; x < ex_n; x++) { + if (x == sx) { + pel_diff = p_src[x] - p_src[-i_src + x - 1]; + top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + UPROW_S[x] = (int8_t)top_sign; + } + pel_diff = p_src[x] - p_src[i_src + x + 1]; + down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + edge_type = down_sign + UPROW_S[x] + 2; + p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + } + break; + case SAO_TYPE_EO_45: + sx = lcu_avail[SAO_L] ? 0 : 1; + ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); + + // init the line buffer + for (x = sx; x < ex; x++) { + pel_diff = p_src[i_src + x - 1] - p_src[x]; + top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + UPROW_S[x - 1] = (int8_t)top_sign; + } + + // first row + sx_0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); + ex_0 = lcu_avail[SAO_TR] ? 
i_block_w : (i_block_w - 1); + for (x = sx_0; x < ex_0; x++) { + pel_diff = p_src[x] - p_src[-i_src + x + 1]; + top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + edge_type = top_sign - UPROW_S[x - 1] + 2; + p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + } + + // middle rows + for (y = 1; y < i_block_h - 1; y++) { + p_src += i_src; + p_dst += i_dst; + for (x = sx; x < ex; x++) { + if (x == ex - 1) { + pel_diff = p_src[x] - p_src[-i_src + x + 1]; + top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + UPROW_S[x] = (int8_t)top_sign; + } + pel_diff = p_src[x] - p_src[i_src + x - 1]; + down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + edge_type = down_sign + UPROW_S[x] + 2; + p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + UPROW_S[x - 1] = (int8_t)(-down_sign); + } + } + + // last row + sx_n = lcu_avail[SAO_DL] ? 0 : 1; + ex_n = lcu_avail[SAO_D] ? (lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1)) : 1; + p_src += i_src; + p_dst += i_dst; + for (x = sx_n; x < ex_n; x++) { + if (x == ex - 1) { + pel_diff = p_src[x] - p_src[-i_src + x + 1]; + top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); + UPROW_S[x] = (int8_t)top_sign; + } + pel_diff = p_src[x] - p_src[i_src + x - 1]; + down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? 
-1 : 0); + edge_type = down_sign + UPROW_S[x] + 2; + p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); + } + break; + case SAO_TYPE_BO: + pel_diff = h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; + for (y = 0; y < i_block_h; y++) { + for (x = 0; x < i_block_w; x++) { + edge_type = p_src[x] >> pel_diff; + p_dst[x] = (pel10_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } p_src += i_src; p_dst += i_dst; @@ -230,17 +412,31 @@ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, /* --------------------------------------------------------------------------- */ -void xavs2_sao_init(uint32_t cpuid, intrinsic_func_t *pf) +void xavs2_sao_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf) { - pf->sao_block = sao_block_c; + if (param->input_sample_bit_depth == 8) { + pf->sao_block8 = sao_block8_c; +#if HAVE_MMX + if (cpuid & XAVS2_CPU_SSE4) { + pf->sao_block8 = SAO_on_block_sse128; + } +#ifdef _MSC_VER + if (cpuid & XAVS2_CPU_AVX2) { + pf->sao_block8 = SAO_on_block_sse256; + } +#endif // if _MSC_VER +#endif // HAVE_MMX + } else { + pf->sao_block10 = sao_block10_c; #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE4) { - pf->sao_block = SAO_on_block_sse128; + pf->sao_block10 = SAO_on_block_sse128; } #ifdef _MSC_VER if (cpuid & XAVS2_CPU_AVX2) { - pf->sao_block = SAO_on_block_sse256; + pf->sao_block10 = SAO_on_block_sse256; } #endif // if _MSC_VER #endif // HAVE_MMX + } } diff --git a/source/common/frame.c b/source/common/frame.c index 976ecc7..544cd8e 100644 --- a/source/common/frame.c +++ b/source/common/frame.c @@ -147,11 +147,12 @@ size_t xavs2_frame_buffer_size(const xavs2_param_t *param, int alloc_type) } /* compute space size and alloc memory */ + if (param->input_sample_bit_depth == 8) { mem_size = sizeof(xavs2_frame_t) + /* M0, size of frame handle */ i_nal_info_size + /* M1, size of nal_info buffer */ cmp_size + cmp_buf_size + /* M2, size of frame complexity buffer */ bs_size + /* 
M3, size of bitstream buffer */ - planes_size * sizeof(pel_t) + /* M4, size of planes buffer: Y+U+V */ + planes_size * sizeof(pel8_t) + /* M4, size of planes buffer: Y+U+V */ frame_size_in_mvstore * sizeof(int8_t) + /* M5, size of pu reference index buffer */ frame_size_in_mvstore * sizeof(mv_t) + /* M6, size of pu motion vector buffer */ #if SAVE_CU_INFO @@ -159,11 +160,28 @@ size_t xavs2_frame_buffer_size(const xavs2_param_t *param, int alloc_type) #endif (img_h_l >> MIN_CU_SIZE_IN_BIT) * sizeof(int)+ /* M8, line status array */ CACHE_LINE_SIZE * 10; + /* align to CACHE_LINE_SIZE */ + mem_size = (mem_size + CACHE_LINE_SIZE - 1) & (~(uint32_t)(CACHE_LINE_SIZE - 1)); + return mem_size; + } else { + mem_size = sizeof(xavs2_frame_t) + /* M0, size of frame handle */ + i_nal_info_size + /* M1, size of nal_info buffer */ + cmp_size + cmp_buf_size + /* M2, size of frame complexity buffer */ + bs_size + /* M3, size of bitstream buffer */ + planes_size * sizeof(pel10_t) + /* M4, size of planes buffer: Y+U+V */ + frame_size_in_mvstore * sizeof(int8_t) + /* M5, size of pu reference index buffer */ + frame_size_in_mvstore * sizeof(mv_t) + /* M6, size of pu motion vector buffer */ +#if SAVE_CU_INFO + frame_size_in_mincu * sizeof(int8_t) * 3 + /* M7, size of cu mode/cbp/level buffers */ +#endif + (img_h_l >> MIN_CU_SIZE_IN_BIT) * sizeof(int)+ /* M8, line status array */ + CACHE_LINE_SIZE * 10; /* align to CACHE_LINE_SIZE */ mem_size = (mem_size + CACHE_LINE_SIZE - 1) & (~(uint32_t)(CACHE_LINE_SIZE - 1)); return mem_size; + } } /* --------------------------------------------------------------------------- @@ -189,7 +207,6 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type) int frame_size_in_mincu = 0; #endif int frame_size_in_mvstore = 0; /* reference information size */ - uint8_t *mem_ptr; /* compute stride and the plane size */ switch (alloc_type) { @@ -236,11 +253,13 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int 
alloc_type) } /* compute space size and alloc memory */ + if (h->param->input_sample_bit_depth == 8) { + uint8_t *mem_ptr; mem_size = sizeof(xavs2_frame_t) + /* M0, size of frame handle */ i_nal_info_size + /* M1, size of nal_info buffer */ cmp_size + cmp_buf_size + /* M2, size of frame complexity buffer */ bs_size + /* M3, size of bitstream buffer */ - planes_size * sizeof(pel_t) + /* M4, size of planes buffer: Y+U+V */ + planes_size * sizeof(pel8_t) + /* M4, size of planes buffer: Y+U+V */ frame_size_in_mvstore * sizeof(int8_t) + /* M5, size of pu reference index buffer */ frame_size_in_mvstore * sizeof(mv_t) + /* M6, size of pu motion vector buffer */ #if SAVE_CU_INFO @@ -248,12 +267,11 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type) #endif h->i_height_in_lcu * sizeof(int) + /* M8, line status array */ CACHE_LINE_SIZE * 10; - /* align to CACHE_LINE_SIZE */ mem_size = (mem_size + CACHE_LINE_SIZE - 1) & (~(uint32_t)(CACHE_LINE_SIZE - 1)); if (mem_base == NULL) { - CHECKED_MALLOC(mem_ptr, uint8_t *, mem_size); + CHECKED_MALLOC8(mem_ptr, uint8_t *, mem_size); } else { mem_ptr = *mem_base; } @@ -305,54 +323,54 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type) } /* M3, buffer for planes: Y+U+V */ - frame->plane_buf = (pel_t *)mem_ptr; - frame->size_plane_buf = (size_l + 2 * size_c) * sizeof(pel_t); + frame->plane_buf8 = (pel8_t *)mem_ptr; + frame->size_plane_buf = (size_l + 2 * size_c) * sizeof(pel8_t); - frame->planes[0] = (pel_t *)mem_ptr; - frame->planes[1] = frame->planes[0] + size_l; - frame->planes[2] = frame->planes[1] + size_c; - mem_ptr += (size_l + size_c * 2) * sizeof(pel_t); + frame->planes8[0] = (pel8_t *)mem_ptr; + frame->planes8[1] = frame->planes8[0] + size_l; + frame->planes8[2] = frame->planes8[1] + size_c; + mem_ptr += (size_l + size_c * 2) * sizeof(pel8_t); if (alloc_type == FT_DEC || alloc_type == FT_TEMP) { uint8_t *p_align; /* point to plane data area */ - frame->planes[0] += 
frame->i_stride[0] * (XAVS2_PAD ) + (XAVS2_PAD ); - frame->planes[1] += frame->i_stride[1] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2); - frame->planes[2] += frame->i_stride[2] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2); + frame->planes8[0] += frame->i_stride[0] * (XAVS2_PAD ) + (XAVS2_PAD ); + frame->planes8[1] += frame->i_stride[1] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2); + frame->planes8[2] += frame->i_stride[2] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2); /* make sure the pointers are aligned */ - p_align = (uint8_t *)frame->planes[0]; + p_align = (uint8_t *)frame->planes8[0]; ALIGN_POINTER(p_align); - frame->planes[0] = (pel_t *)p_align; - p_align = (uint8_t *)frame->planes[1]; + frame->planes8[0] = (pel8_t *)p_align; + p_align = (uint8_t *)frame->planes8[1]; ALIGN_POINTER(p_align); - frame->planes[1] = (pel_t *)p_align; - p_align = (uint8_t *)frame->planes[2]; + frame->planes8[1] = (pel8_t *)p_align; + p_align = (uint8_t *)frame->planes8[2]; ALIGN_POINTER(p_align); - frame->planes[2] = (pel_t *)p_align; + frame->planes8[2] = (pel8_t *)p_align; } if (alloc_type == FT_DEC) { /* buffer for luma interpolated planes */ - frame->filtered[0] = frame->planes[0]; // full pel plane, reused + frame->filtered8[0] = frame->planes8[0]; // full pel plane, reused for (i = 1; i < 16; i++) { - frame->filtered[i] = NULL; + frame->filtered8[i] = NULL; } #if ENABLE_FRAME_SUBPEL_INTPL switch (h->use_fractional_me) { case 1: - frame->filtered[2] = (pel_t *)mem_ptr; - mem_ptr += size_l * sizeof(pel_t); - frame->filtered[8] = (pel_t *)mem_ptr; - mem_ptr += size_l * sizeof(pel_t); - frame->filtered[10] = (pel_t *)mem_ptr; - mem_ptr += size_l * sizeof(pel_t); + frame->filtered8[2] = (pel8_t *)mem_ptr; + mem_ptr += size_l * sizeof(pel8_t); + frame->filtered8[8] = (pel8_t *)mem_ptr; + mem_ptr += size_l * sizeof(pel8_t); + frame->filtered8[10] = (pel8_t *)mem_ptr; + mem_ptr += size_l * sizeof(pel8_t); break; case 2: for (i = 1; i < 16; i++) { - frame->filtered[i] = (pel_t *)mem_ptr; - mem_ptr += size_l * 
sizeof(pel_t); + frame->filtered8[i] = (pel8_t *)mem_ptr; + mem_ptr += size_l * sizeof(pel8_t); } break; default: @@ -361,8 +379,8 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type) #endif /* point to plane data area */ for (i = 1; i < 16; i++) { - if (frame->filtered[i] != NULL) { - frame->filtered[i] += frame->i_stride[0] * XAVS2_PAD + XAVS2_PAD; + if (frame->filtered8[i] != NULL) { + frame->filtered8[i] += frame->i_stride[0] * XAVS2_PAD + XAVS2_PAD; } } ALIGN_POINTER(mem_ptr); @@ -400,7 +418,7 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type) if (mem_ptr - (uint8_t *)frame > mem_size) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to alloc one frame, type %d\n", alloc_type); - goto fail; + goto fail8; } /* update mem_base */ @@ -414,17 +432,208 @@ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type) /* initialize signals */ if (xavs2_thread_mutex_init(&frame->mutex, NULL)) { - goto fail; + goto fail8; + } + if (xavs2_thread_cond_init(&frame->cond, NULL)) { + goto fail8; + } + + return frame; + +fail8: + xavs2_free(mem_ptr); + return NULL; + } else { + uint8_t *mem_ptr; + mem_size = sizeof(xavs2_frame_t) + /* M0, size of frame handle */ + i_nal_info_size + /* M1, size of nal_info buffer */ + cmp_size + cmp_buf_size + /* M2, size of frame complexity buffer */ + bs_size + /* M3, size of bitstream buffer */ + planes_size * sizeof(pel10_t) + /* M4, size of planes buffer: Y+U+V */ + frame_size_in_mvstore * sizeof(int8_t) + /* M5, size of pu reference index buffer */ + frame_size_in_mvstore * sizeof(mv_t) + /* M6, size of pu motion vector buffer */ +#if SAVE_CU_INFO + frame_size_in_mincu * sizeof(int8_t) * 3 + /* M7, size of cu mode/cbp/level buffers */ +#endif + h->i_height_in_lcu * sizeof(int) + /* M8, line status array */ + CACHE_LINE_SIZE * 10; + /* align to CACHE_LINE_SIZE */ + mem_size = (mem_size + CACHE_LINE_SIZE - 1) & (~(uint32_t)(CACHE_LINE_SIZE - 1)); + + if (mem_base == 
NULL) { + CHECKED_MALLOC10(mem_ptr, uint8_t *, mem_size); + } else { + mem_ptr = /*(uint16_t*)*/*mem_base; + } + + /* M0, frame handle */ + frame = (xavs2_frame_t *)mem_ptr; + mem_ptr += sizeof(xavs2_frame_t); + ALIGN_POINTER(mem_ptr); + + /* set frame properties */ + frame->i_plane = 3; /* planes: Y+U+V */ + frame->i_width [0] = img_w_l; + frame->i_lines [0] = img_h_l; + frame->i_stride[0] = stride_l; + frame->i_width [1] = frame->i_width [2] = img_w_c; + frame->i_lines [1] = frame->i_lines [2] = img_h_c; + frame->i_stride[1] = frame->i_stride[2] = stride_c; + + /* the default setting of a frame */ + frame->i_frame = -1; + frame->i_frm_coi = -1; + frame->i_gop_idr_coi = -1; + + if (h->param->chroma_format == CHROMA_400) { + frame->i_plane = 1; + } + + frame->i_frm_type = XAVS2_TYPE_AUTO; + frame->i_pts = -1; + frame->i_dts = -1; + frame->b_enable_intra = (h->param->enable_intra); + + /* buffer for fenc */ + if (alloc_type == FT_ENC) { +#if XAVS2_ADAPT_LAYER + /* M1, nal_info buffer */ + frame->nal_info = (xavs2_nal_info_t *)mem_ptr; + frame->i_nal = 0; + mem_ptr += i_nal_info_size; + ALIGN_POINTER(mem_ptr); +#endif + + /* M2, set the bit stream buffer pointer and length + * NOTE: the size of bitstream buffer is big enough, no need to reallocate + * memory in function encoder_encapsulate_nals */ + frame->p_bs_buf = mem_ptr; + frame->i_bs_buf = bs_size; /* the length is long enough */ + mem_ptr += bs_size; + } + + /* M3, buffer for planes: Y+U+V */ + frame->plane_buf10 = (pel10_t *)mem_ptr; + frame->size_plane_buf = (size_l + 2 * size_c) * sizeof(pel10_t); + + frame->planes10[0] = (pel10_t *)mem_ptr; + frame->planes10[1] = frame->planes10[0] + size_l; + frame->planes10[2] = frame->planes10[1] + size_c; + mem_ptr += (size_l + size_c * 2) * sizeof(pel10_t); + + if (alloc_type == FT_DEC || alloc_type == FT_TEMP) { + uint8_t *p_align; + /* point to plane data area */ + frame->planes10[0] += frame->i_stride[0] * (XAVS2_PAD ) + (XAVS2_PAD ); + frame->planes10[1] += 
frame->i_stride[1] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2); + frame->planes10[2] += frame->i_stride[2] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2); + + /* make sure the pointers are aligned */ + p_align = (uint8_t *)frame->planes10[0]; + ALIGN_POINTER(p_align); + frame->planes10[0] = (pel10_t *)p_align; + p_align = (uint8_t *)frame->planes10[1]; + ALIGN_POINTER(p_align); + frame->planes10[1] = (pel10_t *)p_align; + p_align = (uint8_t *)frame->planes10[2]; + ALIGN_POINTER(p_align); + frame->planes10[2] = (pel10_t *)p_align; + } + + if (alloc_type == FT_DEC) { + /* buffer for luma interpolated planes */ + frame->filtered10[0] = frame->planes10[0]; // full pel plane, reused + for (i = 1; i < 16; i++) { + frame->filtered10[i] = NULL; + } +#if ENABLE_FRAME_SUBPEL_INTPL + switch (h->use_fractional_me) { + case 1: + frame->filtered10[2] = (pel10_t *)mem_ptr; + mem_ptr += size_l * sizeof(pel10_t); + frame->filtered10[8] = (pel10_t *)mem_ptr; + mem_ptr += size_l * sizeof(pel10_t); + frame->filtered10[10] = (pel10_t *)mem_ptr; + mem_ptr += size_l * sizeof(pel10_t); + + break; + case 2: + for (i = 1; i < 16; i++) { + frame->filtered10[i] = (pel10_t *)mem_ptr; + mem_ptr += size_l * sizeof(pel10_t); + } + break; + default: + break; + } +#endif + /* point to plane data area */ + for (i = 1; i < 16; i++) { + if (frame->filtered10[i] != NULL) { + frame->filtered10[i] += frame->i_stride[0] * XAVS2_PAD + XAVS2_PAD; + } + } + ALIGN_POINTER(mem_ptr); + + /* M4, reference index buffer */ + frame->pu_ref = (int8_t *)mem_ptr; + mem_ptr += frame_size_in_mvstore * sizeof(int8_t); + ALIGN_POINTER(mem_ptr); + + /* M5, pu motion vector buffer */ + frame->pu_mv = (mv_t *)mem_ptr; + mem_ptr += frame_size_in_mvstore * sizeof(mv_t); + ALIGN_POINTER(mem_ptr); + +#if SAVE_CU_INFO + /* M6, cu mode/cbp/level buffers */ + frame->cu_mode = (int8_t *)mem_ptr; + mem_ptr += frame_size_in_mincu * sizeof(int8_t); + ALIGN_POINTER(mem_ptr); + frame->cu_cbp = (int8_t *)mem_ptr; + mem_ptr += frame_size_in_mincu * 
sizeof(int8_t); + ALIGN_POINTER(mem_ptr); + frame->cu_level = (int8_t *)mem_ptr; + mem_ptr += frame_size_in_mincu * sizeof(int8_t); + ALIGN_POINTER(mem_ptr); +#endif + + /* M7, line status array */ + frame->num_lcu_coded_in_row = (int *)mem_ptr; + mem_ptr += h->i_height_in_lcu * sizeof(int); + ALIGN_POINTER(mem_ptr); + + memset(frame->num_lcu_sao_off, 0, sizeof(frame->num_lcu_sao_off)); + } + + if (mem_ptr - (uint8_t *)frame > mem_size) { + xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to alloc one frame, type %d\n", alloc_type); + goto fail10; + } + + /* update mem_base */ + if (mem_base != NULL) { + *mem_base = /*(uint8_t**)*/mem_ptr; + } + + /* initialize default value */ + frame->i_qpplus1 = 0; + frame->cnt_refered = 0; + + /* initialize signals */ + if (xavs2_thread_mutex_init(&frame->mutex, NULL)) { + goto fail10; } if (xavs2_thread_cond_init(&frame->cond, NULL)) { - goto fail; + goto fail10; } return frame; -fail: +fail10: xavs2_free(mem_ptr); return NULL; + } } /* --------------------------------------------------------------------------- @@ -468,11 +677,50 @@ void xavs2_frame_destroy_objects(xavs2_handler_t *h_mgr, xavs2_frame_t *frame) /* --------------------------------------------------------------------------- */ void -plane_expand_border(pel_t *p_pix, int i_stride, int i_width, int i_height, +plane_expand_border8(pel8_t *p_pix, int i_stride, int i_width, int i_height, + int i_padh, int i_padv, int b_pad_top, int b_pad_bottom) +{ + pel8_t *pix = p_pix; + pel8_t *row; + int y; + + /* --- horizontal ---------------------------------------------- + */ + for (y = 0; y < i_height; y++) { + g_funcs.mem_repeat_p(pix - i_padh, pix[0 ], i_padh); /* left band */ + g_funcs.mem_repeat_p(pix + i_width, pix[i_width - 1], i_padh); /* right band */ + pix += i_stride; + } + + /* --- vertical ------------------------------------------------ + */ + i_width += (i_padh << 1); + + /* upper band */ + if (b_pad_top) { + pix = row = p_pix - i_padh; /* start row position */ + for (y 
= 0; y < i_padv; y++) { + pix -= i_stride; + memcpy(pix, row, i_width * sizeof(pel8_t)); + } + } + + /* lower band */ + if (b_pad_bottom) { + pix = row = p_pix + (i_height - 1) * i_stride - i_padh; + for (y = 0; y < i_padv; y++) { + pix += i_stride; + memcpy(pix, row, i_width * sizeof(pel8_t)); + } + } +} + +void +plane_expand_border10(pel10_t *p_pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom) { - pel_t *pix = p_pix; - pel_t *row; + pel10_t *pix = p_pix; + pel10_t *row; int y; /* --- horizontal ---------------------------------------------- @@ -492,7 +740,7 @@ plane_expand_border(pel_t *p_pix, int i_stride, int i_width, int i_height, pix = row = p_pix - i_padh; /* start row position */ for (y = 0; y < i_padv; y++) { pix -= i_stride; - memcpy(pix, row, i_width * sizeof(pel_t)); + memcpy(pix, row, i_width * sizeof(pel10_t)); } } @@ -501,7 +749,7 @@ plane_expand_border(pel_t *p_pix, int i_stride, int i_width, int i_height, pix = row = p_pix + (i_height - 1) * i_stride - i_padh; for (y = 0; y < i_padv; y++) { pix += i_stride; - memcpy(pix, row, i_width * sizeof(pel_t)); + memcpy(pix, row, i_width * sizeof(pel10_t)); } } } @@ -515,9 +763,10 @@ void xavs2_frame_expand_border_frame(xavs2_t *h, xavs2_frame_t *frame) int b_frame_start = 1; int b_frame_end = 1; int i; - pel_t *pix; + if (h->param->input_sample_bit_depth == 8) { + pel8_t *pix; - UNUSED_PARAMETER(h); + //UNUSED_PARAMETER(h); for (i = 0; i < frame->i_plane; i++) { int chroma = !!i; @@ -527,8 +776,27 @@ void xavs2_frame_expand_border_frame(xavs2_t *h, xavs2_frame_t *frame) int pad_h = XAVS2_PAD >> chroma; int pad_v = XAVS2_PAD >> chroma; - pix = frame->planes[i] + (slice_start_y >> chroma) * stride; - plane_expand_border(pix, stride, width, height, pad_h, pad_v, b_frame_start, b_frame_end); + pix = frame->planes8[i] + (slice_start_y >> chroma) * stride; + + plane_expand_border8(pix, stride, width, height, pad_h, pad_v, b_frame_start, b_frame_end); + } + } 
else { + pel10_t *pix; + + //UNUSED_PARAMETER(h); + + for (i = 0; i < frame->i_plane; i++) { + int chroma = !!i; + int stride = frame->i_stride[i]; + int width = frame->i_width[i]; + int height = slice_height >> chroma; + int pad_h = XAVS2_PAD >> chroma; + int pad_v = XAVS2_PAD >> chroma; + + pix = frame->planes10[i] + (slice_start_y >> chroma) * stride; + + plane_expand_border10(pix, stride, width, height, pad_h, pad_v, b_frame_start, b_frame_end); + } } } @@ -553,7 +821,8 @@ void xavs2_frame_expand_border_lcurow(xavs2_t *h, xavs2_frame_t *frame, int i_lc int y_start = ((i_lcu_y + 0) << (i_lcu_level - chroma_shift)); int y_end = ((i_lcu_y + 1) << (i_lcu_level - chroma_shift)); int height; - pel_t *pix; + if (h->param->input_sample_bit_depth == 8) { + pel8_t *pix; if (i_lcu_y != h->slices[h->i_slice_index]->i_first_lcu_y) { y_start -= UP_SHIFT; @@ -569,8 +838,30 @@ void xavs2_frame_expand_border_lcurow(xavs2_t *h, xavs2_frame_t *frame, int i_lc // h->fenc->i_frame, h->i_slice_index, i_lcu_y, y_start, y_end); // } - pix = frame->planes[i] + y_start * stride; - plane_expand_border(pix, stride, width, height, padh, padv, b_start, b_end); + pix = frame->planes8[i] + y_start * stride; + + plane_expand_border8(pix, stride, width, height, padh, padv, b_start, b_end); + } else { + pel10_t *pix; + + if (i_lcu_y != h->slices[h->i_slice_index]->i_first_lcu_y) { + y_start -= UP_SHIFT; + } + if (i_lcu_y != h->slices[h->i_slice_index]->i_last_lcu_y) { + y_end -= UP_SHIFT; + } + + y_end = XAVS2_MIN(frame->i_lines[i], y_end); + height = y_end - y_start; + // if (i == 0) { + // xavs2_log(NULL, XAVS2_LOG_DEBUG, "Pad POC [%3d], Slice %2d, Row %2d, [%3d, %3d)\n", + // h->fenc->i_frame, h->i_slice_index, i_lcu_y, y_start, y_end); + // } + + pix = frame->planes10[i] + y_start * stride; + + plane_expand_border10(pix, stride, width, height, padh, padv, b_start, b_end); + } } } @@ -588,9 +879,29 @@ void xavs2_frame_expand_border_mod8(xavs2_t *h, xavs2_frame_t *frame) int i_pady = 
(h->i_height - h->param->org_height) >> i_scale; int i_stride = frame->i_stride[i]; + if (h->param->input_sample_bit_depth == 8) { /* expand right border */ if (i_padx) { - pel_t *pix = frame->planes[i] + i_width; + pel8_t *pix = frame->planes8[i] + i_width; + for (y = 0; y < i_height; y++) { + memset(pix, pix[-1], i_padx); + pix += i_stride; + } + } + + /* expand bottom border */ + if (i_pady) { + int rowlen = (i_width + i_padx) * sizeof(pel8_t); + pel8_t *row = frame->planes8[i] + (i_height - 1) * i_stride; + pel8_t *pix = frame->planes8[i] + (i_height ) * i_stride; + for (y = i_height; y < i_height + i_pady; y++) { + memcpy(pix, row, rowlen); + pix += i_stride; + } + } + } else { + if (i_padx) { + pel10_t *pix = frame->planes10[i] + i_width; for (y = 0; y < i_height; y++) { memset(pix, pix[-1], i_padx); pix += i_stride; @@ -599,32 +910,43 @@ void xavs2_frame_expand_border_mod8(xavs2_t *h, xavs2_frame_t *frame) /* expand bottom border */ if (i_pady) { - int rowlen = (i_width + i_padx) * sizeof(pel_t); - pel_t *row = frame->planes[i] + (i_height - 1) * i_stride; - pel_t *pix = frame->planes[i] + (i_height ) * i_stride; + int rowlen = (i_width + i_padx) * sizeof(pel10_t); + pel10_t *row = frame->planes10[i] + (i_height - 1) * i_stride; + pel10_t *pix = frame->planes10[i] + (i_height ) * i_stride; for (y = i_height; y < i_height + i_pady; y++) { memcpy(pix, row, rowlen); pix += i_stride; } } + } } } /* --------------------------------------------------------------------------- - * FIXME: 还需要考虑padding区域的拷贝 + * FIXME: 还需要考虑padding区域的拷贝 */ void xavs2_frame_copy_planes(xavs2_t *h, xavs2_frame_t *dst, xavs2_frame_t *src) { int k; - UNUSED_PARAMETER(h); + //UNUSED_PARAMETER(h); if (dst->size_plane_buf == src->size_plane_buf && dst->i_width[0] == src->i_width[0]) { - g_funcs.fast_memcpy(dst->plane_buf, src->plane_buf, src->size_plane_buf); + if (h->param->input_sample_bit_depth == 8) { + g_funcs.fast_memcpy(dst->plane_buf8, src->plane_buf8, src->size_plane_buf); + } 
else { + g_funcs.fast_memcpy(dst->plane_buf10, src->plane_buf10, src->size_plane_buf); + } } else { for (k = 0; k < dst->i_plane; k++) { - g_funcs.plane_copy(dst->planes[k], dst->i_stride[k], - src->planes[k], src->i_stride[k], + if (h->param->input_sample_bit_depth == 8) { + g_funcs.plane_copy8(h, dst->planes8[k], dst->i_stride[k], + src->planes8[k], src->i_stride[k], src->i_width[k], src->i_lines[k]); + } else { + g_funcs.plane_copy10(h, dst->planes10[k], dst->i_stride[k], + src->planes10[k], src->i_stride[k], + src->i_width[k], src->i_lines[k]); + } } } } diff --git a/source/common/frame.h b/source/common/frame.h index c7a1f15..376cb81 100644 --- a/source/common/frame.h +++ b/source/common/frame.h @@ -58,7 +58,9 @@ void xavs2_frame_destroy_objects(xavs2_handler_t *h_mgr, xavs2_frame_t *frame); void xavs2_frame_copy_planes(xavs2_t *h, xavs2_frame_t *dst, xavs2_frame_t *src); #define xavs2_frame_expand_border_frame FPFX(frame_expand_border_frame) -void plane_expand_border(pel_t *p_pix, int i_stride, int i_width, int i_height, +void plane_expand_border8(pel8_t *p_pix, int i_stride, int i_width, int i_height, + int i_padh, int i_padv, int b_pad_top, int b_pad_bottom); +void plane_expand_border10(pel10_t *p_pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom); void xavs2_frame_expand_border_frame(xavs2_t *h, xavs2_frame_t *frame); #define xavs2_frame_expand_border_lcurow FPFX(frame_expand_border_lcurow) diff --git a/source/common/intra.c b/source/common/intra.c index 76279fb..f27f95e 100644 --- a/source/common/intra.c +++ b/source/common/intra.c @@ -124,22 +124,43 @@ static const char tab_auc_dir_dxdy[2][NUM_INTRA_MODE][2] = { /* --------------------------------------------------------------------------- */ -static void intra_pred_ver_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ver8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - 
pel_t *p_src = src + 1; + pel8_t *p_src = src + 1; int y; for (y = 0; y < bsy; y++) { - g_funcs.fast_memcpy(dst, p_src, bsx * sizeof(pel_t)); + g_funcs.fast_memcpy(dst, p_src, bsx * sizeof(pel8_t)); + dst += i_dst; + } +} + +static void intra_pred_ver10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + pel10_t *p_src = src + 1; + int y; + + for (y = 0; y < bsy; y++) { + g_funcs.fast_memcpy(dst, p_src, bsx * sizeof(pel10_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ -static void intra_pred_hor_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_hor8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + pel8_t *p_src = src - 1; + + while (bsy-- != 0) { + g_funcs.mem_repeat_p(dst, *p_src--, bsx); + dst += i_dst; + } +} + +static void intra_pred_hor10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - pel_t *p_src = src - 1; + pel10_t *p_src = src - 1; while (bsy-- != 0) { g_funcs.mem_repeat_p(dst, *p_src--, bsx); @@ -150,7 +171,44 @@ static void intra_pred_hor_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, in /* --------------------------------------------------------------------------- * NOTE: dir_mode = (bAboveAvail << 8) + (bLeftAvail) */ -static void intra_pred_dc_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_dc8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int avail_top = dir_mode >> 8; + int avail_left = dir_mode & 0xFF; + int sum_left = 0; + int sum_above = 0; + int dc_value; + int x, y; + pel8_t *p_src; + + p_src = src - 1; + for (y = 0; y < bsy; y++) { + sum_left += p_src[-y]; + } + + p_src = src + 1; + for (x = 0; x < bsx; x++) { + sum_above += p_src[x]; + } + + if (avail_left && avail_top) { + x = bsx + bsy; + dc_value = ((sum_left + sum_above + (x >> 1)) * 
(512 / x)) >> 9; + } else if (avail_left) { + dc_value = (sum_left + (bsy >> 1)) >> xavs2_log2u(bsy); + } else if (avail_top) { + dc_value = (sum_above + (bsx >> 1)) >> xavs2_log2u(bsx); + } else { + dc_value = ((1 << h->param->input_sample_bit_depth) >> 1); + } + + for (y = 0; y < bsy; y++) { + g_funcs.mem_repeat_p(dst, (pel8_t)dc_value, bsx); + dst += i_dst; + } +} + +static void intra_pred_dc10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int avail_top = dir_mode >> 8; int avail_left = dir_mode & 0xFF; @@ -158,7 +216,7 @@ static void intra_pred_dc_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int int sum_above = 0; int dc_value; int x, y; - pel_t *p_src; + pel10_t *p_src; p_src = src - 1; for (y = 0; y < bsy; y++) { @@ -178,18 +236,62 @@ static void intra_pred_dc_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int } else if (avail_top) { dc_value = (sum_above + (bsx >> 1)) >> xavs2_log2u(bsx); } else { - dc_value = g_dc_value; + dc_value = ((1 << h->param->input_sample_bit_depth) >> 1); } for (y = 0; y < bsy; y++) { - g_funcs.mem_repeat_p(dst, (pel_t)dc_value, bsx); + g_funcs.mem_repeat_p(dst, (pel10_t)dc_value, bsx); dst += i_dst; } } /* --------------------------------------------------------------------------- */ -static void intra_pred_plane_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_plane8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + /* size in bits: 2 3 4 5 6 */ + static const int ib_mult [8] = { 0, 0, 13, 17, 5, 11, 23, 0 }; + static const int ib_shift[8] = { 0, 0, 7, 10, 11, 15, 19, 0 }; + const int mult_h = ib_mult [tab_log2size[bsx]]; + const int mult_v = ib_mult [tab_log2size[bsy]]; + const int shift_h = ib_shift[tab_log2size[bsx]]; + const int shift_v = ib_shift[tab_log2size[bsy]]; + const int W2 = bsx >> 1; /* half block width */ + const int H2 = bsy >> 1; /* half block height */ + const int vmax = (1 << 
h->param->input_sample_bit_depth) - 1; /* max value of pixel */ + int H = 0; + int V = 0; + int a, b, c; + int x, y; + pel8_t *p_src; + + /* calculate H and V */ + p_src = src + W2; + for (x = 1; x < W2 + 1; x++) { + H += x * (p_src[x] - p_src[-x]); + } + p_src = src - H2; + for (y = 1; y < H2 + 1; y++) { + V += y * (p_src[-y] - p_src[y]); + } + + a = (src[-bsy] + src[bsx]) << 4; + b = ((H << 5) * mult_h + (1 << (shift_h - 1))) >> shift_h; + c = ((V << 5) * mult_v + (1 << (shift_v - 1))) >> shift_v; + a += 16 - b * (W2 - 1) - c * (H2 - 1); + + for (y = 0; y < bsy; y++) { + int pix = a; + for (x = 0; x < bsx; x++) { + dst[x] = (pel8_t)XAVS2_CLIP3(0, vmax, pix >> 5); + pix += b; + } + dst += i_dst; + a += c; + } +} + +static void intra_pred_plane10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { /* size in bits: 2 3 4 5 6 */ static const int ib_mult [8] = { 0, 0, 13, 17, 5, 11, 23, 0 }; @@ -200,12 +302,12 @@ static void intra_pred_plane_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, const int shift_v = ib_shift[tab_log2size[bsy]]; const int W2 = bsx >> 1; /* half block width */ const int H2 = bsy >> 1; /* half block height */ - const int vmax = (1 << g_bit_depth) - 1; /* max value of pixel */ + const int vmax = (1 << h->param->input_sample_bit_depth) - 1; /* max value of pixel */ int H = 0; int V = 0; int a, b, c; int x, y; - pel_t *p_src; + pel10_t *p_src; /* calculate H and V */ p_src = src + W2; @@ -225,7 +327,7 @@ static void intra_pred_plane_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, for (y = 0; y < bsy; y++) { int pix = a; for (x = 0; x < bsx; x++) { - dst[x] = (pel_t)XAVS2_CLIP3(0, vmax, pix >> 5); + dst[x] = (pel10_t)XAVS2_CLIP3(0, vmax, pix >> 5); pix += b; } dst += i_dst; @@ -235,19 +337,74 @@ static void intra_pred_plane_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -static void intra_pred_bilinear_c(pel_t *src, pel_t 
*dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_bilinear8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + itr8_t pTop[MAX_CU_SIZE], pLeft[MAX_CU_SIZE], pT[MAX_CU_SIZE], pL[MAX_CU_SIZE], wy[MAX_CU_SIZE]; + int shift_x = tab_log2size[bsx]; + int shift_y = tab_log2size[bsy]; + int shift = XAVS2_MIN(shift_x, shift_y); + int shift_xy = shift_x + shift_y + 1; + int offset = 1 << (shift_x + shift_y); + int vmax = (1 << h->param->input_sample_bit_depth) - 1;; // max value of pixel + int a, b, c, t, wxy, temp; + int predx, val; + int x, y; + pel8_t *p_src; + + p_src = src + 1; + for (x = 0; x < bsx; x++) { + pTop[x] = p_src[x]; + } + p_src = src - 1; + for (y = 0; y < bsy; y++) { + pLeft[y] = p_src[-y]; + } + + a = pTop [bsx - 1]; + b = pLeft[bsy - 1]; + c = (bsx == bsy) ? (a + b + 1) >> 1 : (((a << shift_x) + (b << shift_y)) * 13 + (1 << (shift + 5))) >> (shift + 6); + t = (c << 1) - a - b; + + for (x = 0; x < bsx; x++) { + pT [x] = (itr8_t)(b - pTop[x]); + pTop[x] <<= shift_y; + } + + temp = 0; + for (y = 0; y < bsy; y++) { + pL [y] = (itr8_t)(a - pLeft[y]); + pLeft[y] <<= shift_x; + wy [y] = (itr8_t)temp; + temp += t; + } + + for (y = 0; y < bsy; y++) { + predx = pLeft[y]; + wxy = -wy[y]; + for (x = 0; x < bsx; x++) { + predx += pL[y]; + wxy += wy[y]; + pTop[x] += pT[x]; + val = ((predx << shift_y) + (pTop[x] << shift_x) + wxy + offset) >> shift_xy; + dst[x] = (pel8_t)XAVS2_CLIP3(0, vmax, val); + } + dst += i_dst; + } +} + +static void intra_pred_bilinear10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - itr_t pTop[MAX_CU_SIZE], pLeft[MAX_CU_SIZE], pT[MAX_CU_SIZE], pL[MAX_CU_SIZE], wy[MAX_CU_SIZE]; + itr10_t pTop[MAX_CU_SIZE], pLeft[MAX_CU_SIZE], pT[MAX_CU_SIZE], pL[MAX_CU_SIZE], wy[MAX_CU_SIZE]; int shift_x = tab_log2size[bsx]; int shift_y = tab_log2size[bsy]; int shift = XAVS2_MIN(shift_x, shift_y); int shift_xy = shift_x + shift_y + 1; int offset = 1 << (shift_x + 
shift_y); - int vmax = max_pel_value; // max value of pixel + int vmax = (1 << h->param->input_sample_bit_depth) - 1;; // max value of pixel int a, b, c, t, wxy, temp; int predx, val; int x, y; - pel_t *p_src; + pel10_t *p_src; p_src = src + 1; for (x = 0; x < bsx; x++) { @@ -264,15 +421,15 @@ static void intra_pred_bilinear_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod t = (c << 1) - a - b; for (x = 0; x < bsx; x++) { - pT [x] = (itr_t)(b - pTop[x]); + pT [x] = (itr10_t)(b - pTop[x]); pTop[x] <<= shift_y; } temp = 0; for (y = 0; y < bsy; y++) { - pL [y] = (itr_t)(a - pLeft[y]); + pL [y] = (itr10_t)(a - pLeft[y]); pLeft[y] <<= shift_x; - wy [y] = (itr_t)temp; + wy [y] = (itr10_t)temp; temp += t; } @@ -284,7 +441,7 @@ static void intra_pred_bilinear_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod wxy += wy[y]; pTop[x] += pT[x]; val = ((predx << shift_y) + (pTop[x] << shift_x) + wxy + offset) >> shift_xy; - dst[x] = (pel_t)XAVS2_CLIP3(0, vmax, val); + dst[x] = (pel10_t)XAVS2_CLIP3(0, vmax, val); } dst += i_dst; } @@ -307,7 +464,31 @@ static ALWAYS_INLINE int get_context_pixel(int dir_mode, int xy_flag, int temp_d /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_x_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_x_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int f0, f1, f2, f3; + int i, j; + int iX; + + for (j = 0; j < bsy; j++) { + iX = get_context_pixel(dir_mode, 0, j + 1, &f3); + f0 = 32 - f3; + f1 = 64 - f3; + f2 = 32 + f3; + + for (i = 0; i < bsx; i++) { + dst[i] = (pel8_t)((src[iX ] * f0 + + src[iX + 1] * f1 + + src[iX + 2] * f2 + + src[iX + 3] * f3 + 64) >> 7); + iX++; + } + + dst += i_dst; + } +} + +static void intra_pred_ang10_x_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int f0, f1, f2, f3; int i, j; @@ -320,7 +501,7 @@ static void intra_pred_ang_x_c(pel_t *src, 
pel_t *dst, int i_dst, int dir_mode, f2 = 32 + f3; for (i = 0; i < bsx; i++) { - dst[i] = (pel_t)((src[iX ] * f0 + + dst[i] = (pel10_t)((src[iX ] * f0 + src[iX + 1] * f1 + src[iX + 2] * f2 + src[iX + 3] * f3 + 64) >> 7); @@ -333,7 +514,32 @@ static void intra_pred_ang_x_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_y_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_y_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int offsets[64]; + int xsteps[64]; + int offset; + int i, j; + int iY; + + for (i = 0; i < bsx; i++) { + xsteps[i] = get_context_pixel(dir_mode, 1, i + 1, &offsets[i]); + } + + for (j = 0; j < bsy; j++) { + for (i = 0; i < bsx; i++) { + iY = j + xsteps[i]; + offset = offsets[i]; + dst[i] = (pel8_t)((src[-iY ] * (32 - offset) + + src[-iY - 1] * (64 - offset) + + src[-iY - 2] * (32 + offset) + + src[-iY - 3] * ( offset) + 64) >> 7); + } + dst += i_dst; + } +} + +static void intra_pred_ang10_y_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int offsets[64]; int xsteps[64]; @@ -349,7 +555,7 @@ static void intra_pred_ang_y_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, for (i = 0; i < bsx; i++) { iY = j + xsteps[i]; offset = offsets[i]; - dst[i] = (pel_t)((src[-iY ] * (32 - offset) + + dst[i] = (pel10_t)((src[-iY ] * (32 - offset) + src[-iY - 1] * (64 - offset) + src[-iY - 2] * (32 + offset) + src[-iY - 3] * ( offset) + 64) >> 7); @@ -360,7 +566,7 @@ static void intra_pred_ang_y_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_xy_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_xy_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int 
bsy) { ALIGN16(int xoffsets[64]); ALIGN16(int xsteps[64]); @@ -378,13 +584,13 @@ static void intra_pred_ang_xy_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, iYy = j - xsteps[i]; if (iYy <= -1) { - dst[i] = (pel_t)((src[ iXx + 2] * (32 - offsetx) + + dst[i] = (pel8_t)((src[ iXx + 2] * (32 - offsetx) + src[ iXx + 1] * (64 - offsetx) + src[ iXx ] * (32 + offsetx) + src[ iXx - 1] * ( offsetx) + 64) >> 7); } else { offsety = xoffsets[i]; - dst[i] = (pel_t)((src[-iYy - 2] * (32 - offsety) + + dst[i] = (pel8_t)((src[-iYy - 2] * (32 - offsety) + src[-iYy - 1] * (64 - offsety) + src[-iYy ] * (32 + offsety) + src[-iYy + 1] * ( offsety) + 64) >> 7); @@ -395,18 +601,84 @@ static void intra_pred_ang_xy_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, } } +static void intra_pred_ang10_xy_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(int xoffsets[64]); + ALIGN16(int xsteps[64]); + int i, j, iXx, iYy; + int offsetx, offsety; + + for (i = 0; i < bsx; i++) { + xsteps[i] = get_context_pixel(dir_mode, 1, i + 1, &xoffsets[i]); + } + + for (j = 0; j < bsy; j++) { + iXx = -get_context_pixel(dir_mode, 0, j + 1, &offsetx); + + for (i = 0; i < bsx; i++) { + iYy = j - xsteps[i]; + + if (iYy <= -1) { + dst[i] = (pel10_t)((src[ iXx + 2] * (32 - offsetx) + + src[ iXx + 1] * (64 - offsetx) + + src[ iXx ] * (32 + offsetx) + + src[ iXx - 1] * ( offsetx) + 64) >> 7); + } else { + offsety = xoffsets[i]; + dst[i] = (pel10_t)((src[-iYy - 2] * (32 - offsety) + + src[-iYy - 1] * (64 - offsety) + + src[-iYy ] * (32 + offsety) + + src[-iYy + 1] * ( offsety) + 64) >> 7); + } + iXx++; + } + dst += i_dst; + } +} /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_x_3_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_x_3_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel8_t first_line[(64 + 176) << 
2]); + int line_size = bsx + (bsy >> 2) * 11 - 1; + + int aligned_line_size = 64 + 176; + int i_dst4 = i_dst << 2; + int i; + pel8_t *pfirst[4]; + + pfirst[0] = first_line; + pfirst[1] = pfirst[0] + aligned_line_size; + pfirst[2] = pfirst[1] + aligned_line_size; + pfirst[3] = pfirst[2] + aligned_line_size; + + for (i = 0; i < line_size; i++, src++) { + pfirst[0][i] = (pel8_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + pfirst[1][i] = (pel8_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); + pfirst[2][i] = (pel8_t)((3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); + pfirst[3][i] = (pel8_t)(( src[11] + 2 * src[12] + src[13] + 0 * src[14] + 2) >> 2); + } + + bsy >>= 2; + for (i = 0; i < bsy; i++) { + memcpy(dst , pfirst[0] + i * 11, bsx * sizeof(pel8_t)); + memcpy(dst + i_dst, pfirst[1] + i * 11, bsx * sizeof(pel8_t)); + memcpy(dst + 2 * i_dst, pfirst[2] + i * 11, bsx * sizeof(pel8_t)); + memcpy(dst + 3 * i_dst, pfirst[3] + i * 11, bsx * sizeof(pel8_t)); + dst += i_dst4; + } +} + +static void intra_pred_ang10_x_3_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - ALIGN16(pel_t first_line[(64 + 176) << 2]); + ALIGN16(pel10_t first_line[(64 + 176) << 2]); int line_size = bsx + (bsy >> 2) * 11 - 1; int aligned_line_size = 64 + 176; int i_dst4 = i_dst << 2; int i; - pel_t *pfirst[4]; + pel10_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; @@ -414,62 +686,80 @@ static void intra_pred_ang_x_3_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode pfirst[3] = pfirst[2] + aligned_line_size; for (i = 0; i < line_size; i++, src++) { - pfirst[0][i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); - pfirst[1][i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); - pfirst[2][i] = (pel_t)((3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); - pfirst[3][i] = (pel_t)(( src[11] + 2 * src[12] + src[13] + 0 * src[14] + 2) >> 2); + 
pfirst[0][i] = (pel10_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + pfirst[1][i] = (pel10_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); + pfirst[2][i] = (pel10_t)((3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); + pfirst[3][i] = (pel10_t)(( src[11] + 2 * src[12] + src[13] + 0 * src[14] + 2) >> 2); } bsy >>= 2; for (i = 0; i < bsy; i++) { - memcpy(dst , pfirst[0] + i * 11, bsx * sizeof(pel_t)); - memcpy(dst + i_dst, pfirst[1] + i * 11, bsx * sizeof(pel_t)); - memcpy(dst + 2 * i_dst, pfirst[2] + i * 11, bsx * sizeof(pel_t)); - memcpy(dst + 3 * i_dst, pfirst[3] + i * 11, bsx * sizeof(pel_t)); + memcpy(dst , pfirst[0] + i * 11, bsx * sizeof(pel10_t)); + memcpy(dst + i_dst, pfirst[1] + i * 11, bsx * sizeof(pel10_t)); + memcpy(dst + 2 * i_dst, pfirst[2] + i * 11, bsx * sizeof(pel10_t)); + memcpy(dst + 3 * i_dst, pfirst[3] + i * 11, bsx * sizeof(pel10_t)); dst += i_dst4; } } /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_x_4_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_x_4_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel8_t first_line[64 + 128]); + int line_size = bsx + ((bsy - 1) << 1); + int iHeight2 = bsy << 1; + int i; + + src += 3; + for (i = 0; i < line_size; i++, src++) { + first_line[i] = (pel8_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); + } + + for (i = 0; i < iHeight2; i += 2) { + memcpy(dst, first_line + i, bsx * sizeof(pel8_t)); + dst += i_dst; + } +} + +static void intra_pred_ang10_x_4_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - ALIGN16(pel_t first_line[64 + 128]); + ALIGN16(pel10_t first_line[64 + 128]); int line_size = bsx + ((bsy - 1) << 1); int iHeight2 = bsy << 1; int i; src += 3; for (i = 0; i < line_size; i++, src++) { - first_line[i] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); + 
first_line[i] = (pel10_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); } for (i = 0; i < iHeight2; i += 2) { - memcpy(dst, first_line + i, bsx * sizeof(pel_t)); + memcpy(dst, first_line + i, bsx * sizeof(pel10_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_x_5_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (((bsy > 4) && (bsx > 8))) { - ALIGN16(pel_t first_line[(64 + 80) << 3]); + ALIGN16(pel8_t first_line[(64 + 80) << 3]); int line_size = bsx + (((bsy - 8) * 11) >> 3); int aligned_line_size = ((line_size + 15) >> 4) << 4; - pel_t *pfirst[8]; + pel8_t *pfirst[8]; - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; - pel_t *dst5 = dst4 + i_dst; - pel_t *dst6 = dst5 + i_dst; - pel_t *dst7 = dst6 + i_dst; - pel_t *dst8 = dst7 + i_dst; + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; + pel8_t *dst5 = dst4 + i_dst; + pel8_t *dst6 = dst5 + i_dst; + pel8_t *dst7 = dst6 + i_dst; + pel8_t *dst8 = dst7 + i_dst; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; @@ -481,27 +771,27 @@ static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode pfirst[7] = pfirst[6] + aligned_line_size; for (i = 0; i < line_size; src++, i++) { - pfirst[0][i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); - pfirst[1][i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); - pfirst[2][i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); - pfirst[3][i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); + pfirst[0][i] = (pel8_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); + pfirst[1][i] = (pel8_t)(( src[2] + 
5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + pfirst[2][i] = (pel8_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); + pfirst[3][i] = (pel8_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); - pfirst[4][i] = (pel_t)(( src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); - pfirst[5][i] = (pel_t)(( 3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); - pfirst[6][i] = (pel_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); - pfirst[7][i] = (pel_t)(( src[11] + 2 * src[12] + src[13] + 2) >> 2); + pfirst[4][i] = (pel8_t)(( src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); + pfirst[5][i] = (pel8_t)(( 3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); + pfirst[6][i] = (pel8_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); + pfirst[7][i] = (pel8_t)(( src[11] + 2 * src[12] + src[13] + 2) >> 2); } bsy >>= 3; for (i = 0; i < bsy; i++) { - memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t)); - memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t)); - memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t)); - memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t)); - memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel_t)); - memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel_t)); - memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel_t)); - memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel_t)); + memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel8_t)); + memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel8_t)); + memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel8_t)); + memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel8_t)); + memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel8_t)); + memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel8_t)); + memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel8_t)); + memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel8_t)); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; @@ -513,41 +803,41 @@ static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int 
dir_mode dst8 = dst7 + i_dst; } } else if (bsx == 16) { - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; for (i = 0; i < bsx; i++, src++) { - dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); - dst2[i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); - dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); - dst4[i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); + dst1[i] = (pel8_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); + dst2[i] = (pel8_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + dst3[i] = (pel8_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); + dst4[i] = (pel8_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); } } else if (bsx == 8) { - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; - pel_t *dst5 = dst4 + i_dst; - pel_t *dst6 = dst5 + i_dst; - pel_t *dst7 = dst6 + i_dst; - pel_t *dst8 = dst7 + i_dst; + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; + pel8_t *dst5 = dst4 + i_dst; + pel8_t *dst6 = dst5 + i_dst; + pel8_t *dst7 = dst6 + i_dst; + pel8_t *dst8 = dst7 + i_dst; for (i = 0; i < 8; src++, i++) { - dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); - dst2[i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); - dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); - dst4[i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); + dst1[i] = (pel8_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); + dst2[i] = (pel8_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + dst3[i] = 
(pel8_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); + dst4[i] = (pel8_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); - dst5[i] = (pel_t)(( src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); - dst6[i] = (pel_t)(( 3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); - dst7[i] = (pel_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); - dst8[i] = (pel_t)(( src[11] + 2 * src[12] + src[13] + 2) >> 2); + dst5[i] = (pel8_t)(( src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); + dst6[i] = (pel8_t)(( 3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); + dst7[i] = (pel8_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); + dst8[i] = (pel8_t)(( src[11] + 2 * src[12] + src[13] + 2) >> 2); } if (bsy == 32) { //src -> 8,src[8] -> 16 - pel_t pad1 = src[8]; + pel8_t pad1 = src[8]; dst1 = dst8 + i_dst; int j; for (j = 0; j < 24; j++) { @@ -562,32 +852,32 @@ static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode dst3 = dst2 + i_dst; src += 4; - dst1[0] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); - dst1[1] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); - dst1[2] = (pel_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5); - dst1[3] = (pel_t)((5 * src[3] + 13 * src[4] + 11 * src[5] + 3 * src[6] + 16) >> 5); - dst2[0] = (pel_t)((src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4); - dst2[1] = (pel_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); - dst2[2] = (pel_t)((src[3] + 5 * src[4] + 7 * src[5] + 3 * src[6] + 8) >> 4); - dst3[0] = (pel_t)((7 * src[3] + 15 * src[4] + 9 * src[5] + src[6] + 16) >> 5); + dst1[0] = (pel8_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); + dst1[1] = (pel8_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); + dst1[2] = (pel8_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5); + 
dst1[3] = (pel8_t)((5 * src[3] + 13 * src[4] + 11 * src[5] + 3 * src[6] + 16) >> 5); + dst2[0] = (pel8_t)((src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4); + dst2[1] = (pel8_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + dst2[2] = (pel8_t)((src[3] + 5 * src[4] + 7 * src[5] + 3 * src[6] + 8) >> 4); + dst3[0] = (pel8_t)((7 * src[3] + 15 * src[4] + 9 * src[5] + src[6] + 16) >> 5); } } else { - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; for (i = 0; i < 4; i++, src++) { - dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); - dst2[i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); - dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); - dst4[i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); + dst1[i] = (pel8_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); + dst2[i] = (pel8_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + dst3[i] = (pel8_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); + dst4[i] = (pel8_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); } if (bsy == 16) { - pel_t *dst5 = dst4 + i_dst; + pel8_t *dst5 = dst4 + i_dst; src += 4; - pel_t pad1 = src[0]; + pel8_t pad1 = src[0]; int j; for (j = 0; j < 12; j++) { @@ -597,107 +887,332 @@ static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode dst5 += i_dst; } dst5 = dst4 + i_dst; - dst5[0] = (pel_t)((src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); - dst5[1] = (pel_t)((src[-1] + 9 * src[ 0] + 15 * src[1] + 7 * src[2] + 16) >> 5); + dst5[0] = (pel8_t)((src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); + dst5[1] = (pel8_t)((src[-1] + 9 * src[ 0] + 15 * src[1] + 7 * src[2] + 16) >> 5); } } } -/* 
--------------------------------------------------------------------------- - */ -static void intra_pred_ang_x_6_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang10_x_5_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - ALIGN16(pel_t first_line[64 + 64]); - int line_size = bsx + bsy - 1; int i; - for (i = 0; i < line_size; i++, src++) { - first_line[i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); - } + if (((bsy > 4) && (bsx > 8))) { + ALIGN16(pel10_t first_line[(64 + 80) << 3]); + int line_size = bsx + (((bsy - 8) * 11) >> 3); + int aligned_line_size = ((line_size + 15) >> 4) << 4; + pel10_t *pfirst[8]; - for (i = 0; i < bsy; i++) { - memcpy(dst, first_line + i, bsx * sizeof(pel_t)); + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + pel10_t *dst5 = dst4 + i_dst; + pel10_t *dst6 = dst5 + i_dst; + pel10_t *dst7 = dst6 + i_dst; + pel10_t *dst8 = dst7 + i_dst; + + pfirst[0] = first_line; + pfirst[1] = pfirst[0] + aligned_line_size; + pfirst[2] = pfirst[1] + aligned_line_size; + pfirst[3] = pfirst[2] + aligned_line_size; + pfirst[4] = pfirst[3] + aligned_line_size; + pfirst[5] = pfirst[4] + aligned_line_size; + pfirst[6] = pfirst[5] + aligned_line_size; + pfirst[7] = pfirst[6] + aligned_line_size; + + for (i = 0; i < line_size; src++, i++) { + pfirst[0][i] = (pel10_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); + pfirst[1][i] = (pel10_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + pfirst[2][i] = (pel10_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); + pfirst[3][i] = (pel10_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); + + pfirst[4][i] = (pel10_t)(( src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); + pfirst[5][i] = (pel10_t)(( 3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); + pfirst[6][i] = (pel10_t)(( 3 * src[9] + 11 
* src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); + pfirst[7][i] = (pel10_t)(( src[11] + 2 * src[12] + src[13] + 2) >> 2); + } + + bsy >>= 3; + for (i = 0; i < bsy; i++) { + memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel10_t)); + memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel10_t)); + memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel10_t)); + memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel10_t)); + memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel10_t)); + memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel10_t)); + memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel10_t)); + memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel10_t)); + + dst1 = dst8 + i_dst; + dst2 = dst1 + i_dst; + dst3 = dst2 + i_dst; + dst4 = dst3 + i_dst; + dst5 = dst4 + i_dst; + dst6 = dst5 + i_dst; + dst7 = dst6 + i_dst; + dst8 = dst7 + i_dst; + } + } else if (bsx == 16) { + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + + for (i = 0; i < bsx; i++, src++) { + dst1[i] = (pel10_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); + dst2[i] = (pel10_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + dst3[i] = (pel10_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); + dst4[i] = (pel10_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); + } + } else if (bsx == 8) { + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + pel10_t *dst5 = dst4 + i_dst; + pel10_t *dst6 = dst5 + i_dst; + pel10_t *dst7 = dst6 + i_dst; + pel10_t *dst8 = dst7 + i_dst; + + for (i = 0; i < 8; src++, i++) { + dst1[i] = (pel10_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); + dst2[i] = (pel10_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + dst3[i] = (pel10_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); + dst4[i] = (pel10_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); + + 
dst5[i] = (pel10_t)(( src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); + dst6[i] = (pel10_t)(( 3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); + dst7[i] = (pel10_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); + dst8[i] = (pel10_t)(( src[11] + 2 * src[12] + src[13] + 2) >> 2); + } + if (bsy == 32) { + //src -> 8,src[8] -> 16 + pel10_t pad1 = src[8]; + dst1 = dst8 + i_dst; + int j; + for (j = 0; j < 24; j++) { + for (i = 0; i < 8; i++) { + dst1[i] = pad1; + } + dst1 += i_dst; + } + + dst1 = dst8 + i_dst; + dst2 = dst1 + i_dst; + dst3 = dst2 + i_dst; + + src += 4; + dst1[0] = (pel10_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); + dst1[1] = (pel10_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); + dst1[2] = (pel10_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5); + dst1[3] = (pel10_t)((5 * src[3] + 13 * src[4] + 11 * src[5] + 3 * src[6] + 16) >> 5); + dst2[0] = (pel10_t)((src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4); + dst2[1] = (pel10_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + dst2[2] = (pel10_t)((src[3] + 5 * src[4] + 7 * src[5] + 3 * src[6] + 8) >> 4); + dst3[0] = (pel10_t)((7 * src[3] + 15 * src[4] + 9 * src[5] + src[6] + 16) >> 5); + } + } else { + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + + for (i = 0; i < 4; i++, src++) { + dst1[i] = (pel10_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); + dst2[i] = (pel10_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); + dst3[i] = (pel10_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); + dst4[i] = (pel10_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); + } + if (bsy == 16) { + pel10_t *dst5 = dst4 + i_dst; + + src += 4; + pel10_t pad1 = src[0]; + + int j; + for (j = 0; j < 12; j++) { + for (i = 0; i < 4; i++) { + dst5[i] = pad1; + } + dst5 += i_dst; + 
} + dst5 = dst4 + i_dst; + dst5[0] = (pel10_t)((src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); + dst5[1] = (pel10_t)((src[-1] + 9 * src[ 0] + 15 * src[1] + 7 * src[2] + 16) >> 5); + } + } +} + +/* --------------------------------------------------------------------------- + */ +static void intra_pred_ang8_x_6_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel8_t first_line[64 + 64]); + int line_size = bsx + bsy - 1; + int i; + + for (i = 0; i < line_size; i++, src++) { + first_line[i] = (pel8_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); + } + + for (i = 0; i < bsy; i++) { + memcpy(dst, first_line + i, bsx * sizeof(pel8_t)); + dst += i_dst; + } +} + +static void intra_pred_ang10_x_6_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel10_t first_line[64 + 64]); + int line_size = bsx + bsy - 1; + int i; + + for (i = 0; i < line_size; i++, src++) { + first_line[i] = (pel10_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); + } + + for (i = 0; i < bsy; i++) { + memcpy(dst, first_line + i, bsx * sizeof(pel10_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_x_7_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_x_7_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; if (bsy == 4) { for (i = 0; i < bsx; src++, i++) { - dst1[i] = (pel_t)((src[0] * 9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); - dst2[i] = (pel_t)((src[1] * 9 + src[2] * 25 + src[3] * 23 + src[4] * 7 + 32) >> 6); - dst3[i] = (pel_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] * 5 + 64) >> 7); 
- dst4[i] = (pel_t)((src[2] * 3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7); + dst1[i] = (pel8_t)((src[0] * 9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); + dst2[i] = (pel8_t)((src[1] * 9 + src[2] * 25 + src[3] * 23 + src[4] * 7 + 32) >> 6); + dst3[i] = (pel8_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] * 5 + 64) >> 7); + dst4[i] = (pel8_t)((src[2] * 3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7); } } else if (bsy == 8) { - pel_t *dst5 = dst4 + i_dst; - pel_t *dst6 = dst5 + i_dst; - pel_t *dst7 = dst6 + i_dst; - pel_t *dst8 = dst7 + i_dst; + pel8_t *dst5 = dst4 + i_dst; + pel8_t *dst6 = dst5 + i_dst; + pel8_t *dst7 = dst6 + i_dst; + pel8_t *dst8 = dst7 + i_dst; for (i = 0; i < bsx; src++, i++) { - dst1[i] = (pel_t)((src[0] * 9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); - dst2[i] = (pel_t)((src[1] * 9 + src[2] * 25 + src[3] * 23 + src[4] * 7 + 32) >> 6); - dst3[i] = (pel_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] * 5 + 64) >> 7); - dst4[i] = (pel_t)((src[2] * 3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7); - dst5[i] = (pel_t)((src[3] * 3 + src[4] * 11 + src[5] * 13 + src[6] * 5 + 16) >> 5); - dst6[i] = (pel_t)((src[4] * 21 + src[5] * 53 + src[6] * 43 + src[7] * 11 + 64) >> 7); - dst7[i] = (pel_t)((src[5] * 15 + src[6] * 31 + src[7] * 17 + src[8] + 32) >> 6); - dst8[i] = (pel_t)((src[5] * 3 + src[6] * 19 + src[7] * 29 + src[8] * 13 + 32) >> 6); + dst1[i] = (pel8_t)((src[0] * 9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); + dst2[i] = (pel8_t)((src[1] * 9 + src[2] * 25 + src[3] * 23 + src[4] * 7 + 32) >> 6); + dst3[i] = (pel8_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] * 5 + 64) >> 7); + dst4[i] = (pel8_t)((src[2] * 3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7); + dst5[i] = (pel8_t)((src[3] * 3 + src[4] * 11 + src[5] * 13 + src[6] * 5 + 16) >> 5); + dst6[i] = (pel8_t)((src[4] * 21 + src[5] * 53 + src[6] * 43 + src[7] * 11 + 64) >> 7); + dst7[i] = (pel8_t)((src[5] * 15 + 
src[6] * 31 + src[7] * 17 + src[8] + 32) >> 6); + dst8[i] = (pel8_t)((src[5] * 3 + src[6] * 19 + src[7] * 29 + src[8] * 13 + 32) >> 6); } } else { - intra_pred_ang_x_c(src, dst, i_dst, dir_mode, bsx, bsy); + intra_pred_ang8_x_c(h, src, dst, i_dst, dir_mode, bsx, bsy); + } +} + +static void intra_pred_ang10_x_7_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int i; + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + if (bsy == 4) { + for (i = 0; i < bsx; src++, i++) { + dst1[i] = (pel10_t)((src[0] * 9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); + dst2[i] = (pel10_t)((src[1] * 9 + src[2] * 25 + src[3] * 23 + src[4] * 7 + 32) >> 6); + dst3[i] = (pel10_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] * 5 + 64) >> 7); + dst4[i] = (pel10_t)((src[2] * 3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7); + } + } else if (bsy == 8) { + pel10_t *dst5 = dst4 + i_dst; + pel10_t *dst6 = dst5 + i_dst; + pel10_t *dst7 = dst6 + i_dst; + pel10_t *dst8 = dst7 + i_dst; + for (i = 0; i < bsx; src++, i++) { + dst1[i] = (pel10_t)((src[0] * 9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); + dst2[i] = (pel10_t)((src[1] * 9 + src[2] * 25 + src[3] * 23 + src[4] * 7 + 32) >> 6); + dst3[i] = (pel10_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] * 5 + 64) >> 7); + dst4[i] = (pel10_t)((src[2] * 3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7); + dst5[i] = (pel10_t)((src[3] * 3 + src[4] * 11 + src[5] * 13 + src[6] * 5 + 16) >> 5); + dst6[i] = (pel10_t)((src[4] * 21 + src[5] * 53 + src[6] * 43 + src[7] * 11 + 64) >> 7); + dst7[i] = (pel10_t)((src[5] * 15 + src[6] * 31 + src[7] * 17 + src[8] + 32) >> 6); + dst8[i] = (pel10_t)((src[5] * 3 + src[6] * 19 + src[7] * 29 + src[8] * 13 + 32) >> 6); + } + } else { + intra_pred_ang10_x_c(h, src, dst, i_dst, dir_mode, bsx, bsy); } } /* 
--------------------------------------------------------------------------- */ -static void intra_pred_ang_x_8_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_x_8_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel8_t first_line[2 * (64 + 32)]); + int line_size = bsx + (bsy >> 1) - 1; + int aligned_line_size = ((line_size + 15) >> 4) << 4; + int i_dst2 = i_dst << 1; + int i; + pel8_t *pfirst[2]; + + pfirst[0] = first_line; + pfirst[1] = first_line + aligned_line_size; + for (i = 0; i < line_size; i++, src++) { + pfirst[0][i] = (pel8_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); + pfirst[1][i] = (pel8_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); + } + + bsy >>= 1; + for (i = 0; i < bsy; i++) { + memcpy(dst , pfirst[0] + i, bsx * sizeof(pel8_t)); + memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel8_t)); + dst += i_dst2; + } +} + +static void intra_pred_ang10_x_8_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - ALIGN16(pel_t first_line[2 * (64 + 32)]); + ALIGN16(pel10_t first_line[2 * (64 + 32)]); int line_size = bsx + (bsy >> 1) - 1; int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst2 = i_dst << 1; int i; - pel_t *pfirst[2]; + pel10_t *pfirst[2]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; for (i = 0; i < line_size; i++, src++) { - pfirst[0][i] = (pel_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); - pfirst[1][i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); + pfirst[0][i] = (pel10_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); + pfirst[1][i] = (pel10_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); } bsy >>= 1; for (i = 0; i < bsy; i++) { - memcpy(dst , pfirst[0] + i, bsx * sizeof(pel_t)); - memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); + memcpy(dst , pfirst[0] + i, bsx * sizeof(pel10_t)); + memcpy(dst + i_dst, pfirst[1] + i, bsx * 
sizeof(pel10_t)); dst += i_dst2; } } /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_x_9_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_x_9_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsy > 8) { - intra_pred_ang_x_c(src, dst, i_dst, dir_mode, bsx, bsy); + intra_pred_ang8_x_c(h, src, dst, i_dst, dir_mode, bsx, bsy); /* - ALIGN16(pel_t first_line[(64 + 32) * 11]); + ALIGN16(pel8_t first_line[(64 + 32) * 11]); int line_size = bsx + (bsy * 93 >> 8) - 1; int real_size = XAVS2_MIN(line_size, bsx * 2); int aligned_line_size = ((line_size + 31) >> 5) << 5; int i_dst11 = i_dst * 11; int i; - pel_t pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9, pad10, pad11; - pel_t *pfirst[11]; + pel8_t pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9, pad10, pad11; + pel8_t *pfirst[11]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; @@ -711,17 +1226,17 @@ static void intra_pred_ang_x_9_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode pfirst[9] = pfirst[8] + aligned_line_size; pfirst[10] = pfirst[9] + aligned_line_size; for (i = 0; i < real_size; i++, src++) { - pfirst[0][i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); - pfirst[1][i] = (pel_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); - pfirst[2][i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + 1 * src[4] + 32) >> 6); - pfirst[3][i] = (pel_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); - pfirst[4][i] = (pel_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6); - pfirst[5][i] = (pel_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7); - pfirst[6][i] = (pel_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7); - pfirst[7][i] = (pel_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7); - pfirst[8][i] = 
(pel_t)((3 * src[3] + 7 * src[4] + 5 * src[5] + 1 * src[6] + 8) >> 4); - pfirst[9][i] = (pel_t)((3 * src[3] + 11 * src[4] + 13 * src[5] + 5 * src[6] + 16) >> 5); - pfirst[10][i] = (pel_t)((1 * src[3] + 33 * src[4] + 63 * src[5] + 31 * src[6] + 64) >> 7); + pfirst[0][i] = (pel8_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); + pfirst[1][i] = (pel8_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); + pfirst[2][i] = (pel8_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + 1 * src[4] + 32) >> 6); + pfirst[3][i] = (pel8_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); + pfirst[4][i] = (pel8_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6); + pfirst[5][i] = (pel8_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7); + pfirst[6][i] = (pel8_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7); + pfirst[7][i] = (pel8_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7); + pfirst[8][i] = (pel8_t)((3 * src[3] + 7 * src[4] + 5 * src[5] + 1 * src[6] + 8) >> 4); + pfirst[9][i] = (pel8_t)((3 * src[3] + 11 * src[4] + 13 * src[5] + 5 * src[6] + 16) >> 5); + pfirst[10][i] = (pel8_t)((1 * src[3] + 33 * src[4] + 63 * src[5] + 31 * src[6] + 64) >> 7); } // padding @@ -776,75 +1291,266 @@ static void intra_pred_ang_x_9_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode int bsy_b = bsy / 11; for (i = 0; i < bsy_b; i++) { - memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); - memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); - memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel_t)); - memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel_t)); - memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel_t)); - memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel_t)); - memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel_t)); - memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel_t)); - memcpy(dst + 8 * i_dst, pfirst[8] + i, bsx * sizeof(pel_t)); - 
memcpy(dst + 9 * i_dst, pfirst[9] + i, bsx * sizeof(pel_t)); - memcpy(dst + 10 * i_dst, pfirst[10] + i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[0] + i, bsx * sizeof(pel8_t)); + memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 8 * i_dst, pfirst[8] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 9 * i_dst, pfirst[9] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 10 * i_dst, pfirst[10] + i, bsx * sizeof(pel8_t)); dst += i_dst11; } int bsy_r = bsy - bsy_b * 11; for (i = 0; i < bsy_r; i++) { - memcpy(dst, pfirst[i] + bsy_b, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[i] + bsy_b, bsx * sizeof(pel8_t)); dst += i_dst; } */ } else if (bsy == 8) { - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; - pel_t *dst5 = dst4 + i_dst; - pel_t *dst6 = dst5 + i_dst; - pel_t *dst7 = dst6 + i_dst; - pel_t *dst8 = dst7 + i_dst; + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; + pel8_t *dst5 = dst4 + i_dst; + pel8_t *dst6 = dst5 + i_dst; + pel8_t *dst7 = dst6 + i_dst; + pel8_t *dst8 = dst7 + i_dst; for (int i = 0; i < bsx; i++, src++) { - dst1[i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); - dst2[i] = (pel_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); - dst3[i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + src[4] + 32) >> 6); - dst4[i] = (pel_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); + dst1[i] = (pel8_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); + dst2[i] 
= (pel8_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); + dst3[i] = (pel8_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + src[4] + 32) >> 6); + dst4[i] = (pel8_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); - dst5[i] = (pel_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6); - dst6[i] = (pel_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7); - dst7[i] = (pel_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7); - dst8[i] = (pel_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7); + dst5[i] = (pel8_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6); + dst6[i] = (pel8_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7); + dst7[i] = (pel8_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7); + dst8[i] = (pel8_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7); } } else { /*if (bsy == 4)*/ - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; for (int i = 0; i < bsx; i++, src++) { - dst1[i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); - dst2[i] = (pel_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); - dst3[i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + src[4] + 32) >> 6); - dst4[i] = (pel_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); + dst1[i] = (pel8_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); + dst2[i] = (pel8_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); + dst3[i] = (pel8_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + src[4] + 32) >> 6); + dst4[i] = (pel8_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); } } +} + +static void intra_pred_ang10_x_9_c(xavs2_t *h, 
pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + if (bsy > 8) { + intra_pred_ang10_x_c(h, src, dst, i_dst, dir_mode, bsx, bsy); + /* + ALIGN16(pel10_t first_line[(64 + 32) * 11]); + int line_size = bsx + (bsy * 93 >> 8) - 1; + int real_size = XAVS2_MIN(line_size, bsx * 2); + int aligned_line_size = ((line_size + 31) >> 5) << 5; + int i_dst11 = i_dst * 11; + int i; + pel10_t pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9, pad10, pad11; + pel10_t *pfirst[11]; + + pfirst[0] = first_line; + pfirst[1] = pfirst[0] + aligned_line_size; + pfirst[2] = pfirst[1] + aligned_line_size; + pfirst[3] = pfirst[2] + aligned_line_size; + pfirst[4] = pfirst[3] + aligned_line_size; + pfirst[5] = pfirst[4] + aligned_line_size; + pfirst[6] = pfirst[5] + aligned_line_size; + pfirst[7] = pfirst[6] + aligned_line_size; + pfirst[8] = pfirst[7] + aligned_line_size; + pfirst[9] = pfirst[8] + aligned_line_size; + pfirst[10] = pfirst[9] + aligned_line_size; + for (i = 0; i < real_size; i++, src++) { + pfirst[0][i] = (pel10_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); + pfirst[1][i] = (pel10_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); + pfirst[2][i] = (pel10_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + 1 * src[4] + 32) >> 6); + pfirst[3][i] = (pel10_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); + pfirst[4][i] = (pel10_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6); + pfirst[5][i] = (pel10_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7); + pfirst[6][i] = (pel10_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7); + pfirst[7][i] = (pel10_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7); + pfirst[8][i] = (pel10_t)((3 * src[3] + 7 * src[4] + 5 * src[5] + 1 * src[6] + 8) >> 4); + pfirst[9][i] = (pel10_t)((3 * src[3] + 11 * src[4] + 13 * src[5] + 5 * src[6] + 16) >> 5); + pfirst[10][i] = (pel10_t)((1 * src[3] + 33 * 
src[4] + 63 * src[5] + 31 * src[6] + 64) >> 7); + } + + // padding + if (real_size < line_size) { + pfirst[8][real_size - 3] = pfirst[8][real_size - 4]; + pfirst[9][real_size - 3] = pfirst[9][real_size - 4]; + pfirst[10][real_size - 3] = pfirst[10][real_size - 4]; + pfirst[8][real_size - 2] = pfirst[8][real_size - 3]; + pfirst[9][real_size - 2] = pfirst[9][real_size - 3]; + pfirst[10][real_size - 2] = pfirst[10][real_size - 3]; + pfirst[8][real_size - 1] = pfirst[8][real_size - 2]; + pfirst[9][real_size - 1] = pfirst[9][real_size - 2]; + pfirst[10][real_size - 1] = pfirst[10][real_size - 2]; + + pfirst[5][real_size - 2] = pfirst[5][real_size - 3]; + pfirst[6][real_size - 2] = pfirst[6][real_size - 3]; + pfirst[7][real_size - 2] = pfirst[7][real_size - 3]; + pfirst[5][real_size - 1] = pfirst[5][real_size - 2]; + pfirst[6][real_size - 1] = pfirst[6][real_size - 2]; + pfirst[7][real_size - 1] = pfirst[7][real_size - 2]; + + pfirst[2][real_size - 1] = pfirst[2][real_size - 2]; + pfirst[3][real_size - 1] = pfirst[3][real_size - 2]; + pfirst[4][real_size - 1] = pfirst[4][real_size - 2]; + + + pad1 = pfirst[0][real_size - 1]; + pad2 = pfirst[1][real_size - 1]; + pad3 = pfirst[2][real_size - 1]; + pad4 = pfirst[3][real_size - 1]; + pad5 = pfirst[4][real_size - 1]; + pad6 = pfirst[5][real_size - 1]; + pad7 = pfirst[6][real_size - 1]; + pad8 = pfirst[7][real_size - 1]; + pad9 = pfirst[8][real_size - 1]; + pad10 = pfirst[9][real_size - 1]; + pad11 = pfirst[10][real_size - 1]; + for (; i < line_size; i++) { + pfirst[0][i] = pad1; + pfirst[1][i] = pad2; + pfirst[2][i] = pad3; + pfirst[3][i] = pad4; + pfirst[4][i] = pad5; + pfirst[5][i] = pad6; + pfirst[6][i] = pad7; + pfirst[7][i] = pad8; + pfirst[8][i] = pad9; + pfirst[9][i] = pad10; + pfirst[10][i] = pad11; + } + } + + int bsy_b = bsy / 11; + for (i = 0; i < bsy_b; i++) { + memcpy(dst, pfirst[0] + i, bsx * sizeof(pel10_t)); + memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 2 * i_dst, pfirst[2] + i, 
bsx * sizeof(pel10_t)); + memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 8 * i_dst, pfirst[8] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 9 * i_dst, pfirst[9] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 10 * i_dst, pfirst[10] + i, bsx * sizeof(pel10_t)); + dst += i_dst11; + } + int bsy_r = bsy - bsy_b * 11; + for (i = 0; i < bsy_r; i++) { + memcpy(dst, pfirst[i] + bsy_b, bsx * sizeof(pel10_t)); + dst += i_dst; + } + */ + } else if (bsy == 8) { + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + pel10_t *dst5 = dst4 + i_dst; + pel10_t *dst6 = dst5 + i_dst; + pel10_t *dst7 = dst6 + i_dst; + pel10_t *dst8 = dst7 + i_dst; + for (int i = 0; i < bsx; i++, src++) { + dst1[i] = (pel10_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); + dst2[i] = (pel10_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); + dst3[i] = (pel10_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + src[4] + 32) >> 6); + dst4[i] = (pel10_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); + dst5[i] = (pel10_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6); + dst6[i] = (pel10_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7); + dst7[i] = (pel10_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7); + dst8[i] = (pel10_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7); + } + } else { /*if (bsy == 4)*/ + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + for (int i = 0; i < bsx; i++, src++) { + dst1[i] = (pel10_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * 
src[3] + 64) >> 7); + dst2[i] = (pel10_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); + dst3[i] = (pel10_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + src[4] + 32) >> 6); + dst4[i] = (pel10_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); + } + } } /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_x_10_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_x_10_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; + int i; + + if (bsy != 4) { + ALIGN16(pel8_t first_line[4 * (64 + 16)]); + int line_size = bsx + bsy / 4 - 1; + int aligned_line_size = ((line_size + 15) >> 4) << 4; + pel8_t *pfirst[4]; + + pfirst[0] = first_line; + pfirst[1] = first_line + aligned_line_size; + pfirst[2] = first_line + aligned_line_size * 2; + pfirst[3] = first_line + aligned_line_size * 3; + + for (i = 0; i < line_size; i++, src++) { + pfirst[0][i] = (pel8_t)((src[0] * 3 + src[1] * 7 + src[2] * 5 + src[3] + 8) >> 4); + pfirst[1][i] = (pel8_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); + pfirst[2][i] = (pel8_t)((src[0] + src[1] * 5 + src[2] * 7 + src[3] * 3 + 8) >> 4); + pfirst[3][i] = (pel8_t)((src[1] + src[2] * 2 + src[3] + 2) >> 2); + } + + bsy >>= 2; + i_dst <<= 2; + for (i = 0; i < bsy; i++) { + memcpy(dst1, pfirst[0] + i, bsx * sizeof(pel8_t)); + memcpy(dst2, pfirst[1] + i, bsx * sizeof(pel8_t)); + memcpy(dst3, pfirst[2] + i, bsx * sizeof(pel8_t)); + memcpy(dst4, pfirst[3] + i, bsx * sizeof(pel8_t)); + dst1 += i_dst; + dst2 += i_dst; + dst3 += i_dst; + dst4 += i_dst; + } + } else { + for (i = 0; i < bsx; i++, src++) { + dst1[i] = (pel8_t)((src[0] * 3 + src[1] * 7 + src[2] * 5 + src[3] + 8) >> 4); + dst2[i] = (pel8_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); + dst3[i] = 
(pel8_t)((src[0] + src[1] * 5 + src[2] * 7 + src[3] * 3 + 8) >> 4); + dst4[i] = (pel8_t)((src[1] + src[2] * 2 + src[3] + 2) >> 2); + } + } +} + +static void intra_pred_ang10_x_10_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; int i; if (bsy != 4) { - ALIGN16(pel_t first_line[4 * (64 + 16)]); + ALIGN16(pel10_t first_line[4 * (64 + 16)]); int line_size = bsx + bsy / 4 - 1; int aligned_line_size = ((line_size + 15) >> 4) << 4; - pel_t *pfirst[4]; + pel10_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; @@ -852,19 +1558,19 @@ static void intra_pred_ang_x_10_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod pfirst[3] = first_line + aligned_line_size * 3; for (i = 0; i < line_size; i++, src++) { - pfirst[0][i] = (pel_t)((src[0] * 3 + src[1] * 7 + src[2] * 5 + src[3] + 8) >> 4); - pfirst[1][i] = (pel_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); - pfirst[2][i] = (pel_t)((src[0] + src[1] * 5 + src[2] * 7 + src[3] * 3 + 8) >> 4); - pfirst[3][i] = (pel_t)((src[1] + src[2] * 2 + src[3] + 2) >> 2); + pfirst[0][i] = (pel10_t)((src[0] * 3 + src[1] * 7 + src[2] * 5 + src[3] + 8) >> 4); + pfirst[1][i] = (pel10_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); + pfirst[2][i] = (pel10_t)((src[0] + src[1] * 5 + src[2] * 7 + src[3] * 3 + 8) >> 4); + pfirst[3][i] = (pel10_t)((src[1] + src[2] * 2 + src[3] + 2) >> 2); } bsy >>= 2; i_dst <<= 2; for (i = 0; i < bsy; i++) { - memcpy(dst1, pfirst[0] + i, bsx * sizeof(pel_t)); - memcpy(dst2, pfirst[1] + i, bsx * sizeof(pel_t)); - memcpy(dst3, pfirst[2] + i, bsx * sizeof(pel_t)); - memcpy(dst4, pfirst[3] + i, bsx * sizeof(pel_t)); + memcpy(dst1, pfirst[0] + i, bsx * sizeof(pel10_t)); + memcpy(dst2, pfirst[1] + i, bsx * 
sizeof(pel10_t)); + memcpy(dst3, pfirst[2] + i, bsx * sizeof(pel10_t)); + memcpy(dst4, pfirst[3] + i, bsx * sizeof(pel10_t)); dst1 += i_dst; dst2 += i_dst; dst3 += i_dst; @@ -872,25 +1578,25 @@ static void intra_pred_ang_x_10_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod } } else { for (i = 0; i < bsx; i++, src++) { - dst1[i] = (pel_t)((src[0] * 3 + src[1] * 7 + src[2] * 5 + src[3] + 8) >> 4); - dst2[i] = (pel_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); - dst3[i] = (pel_t)((src[0] + src[1] * 5 + src[2] * 7 + src[3] * 3 + 8) >> 4); - dst4[i] = (pel_t)((src[1] + src[2] * 2 + src[3] + 2) >> 2); + dst1[i] = (pel10_t)((src[0] * 3 + src[1] * 7 + src[2] * 5 + src[3] + 8) >> 4); + dst2[i] = (pel10_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); + dst3[i] = (pel10_t)((src[0] + src[1] * 5 + src[2] * 7 + src[3] * 3 + 8) >> 4); + dst4[i] = (pel10_t)((src[1] + src[2] * 2 + src[3] + 2) >> 2); } } } /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_x_11_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_x_11_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsy > 8) { - ALIGN16(pel_t first_line[(64 + 16) << 3]); + ALIGN16(pel8_t first_line[(64 + 16) << 3]); int line_size = bsx + (bsy >> 3) - 1; int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst8 = i_dst << 3; - pel_t *pfirst[8]; + pel8_t *pfirst[8]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; @@ -901,108 +1607,230 @@ static void intra_pred_ang_x_11_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; for (i = 0; i < line_size; i++, src++) { - pfirst[0][i] = (pel_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); - pfirst[1][i] = (pel_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); - pfirst[2][i] = (pel_t)((5 * 
src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); - pfirst[3][i] = (pel_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); + pfirst[0][i] = (pel8_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); + pfirst[1][i] = (pel8_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); + pfirst[2][i] = (pel8_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); + pfirst[3][i] = (pel8_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); - pfirst[4][i] = (pel_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); - pfirst[5][i] = (pel_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); - pfirst[6][i] = (pel_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); - pfirst[7][i] = (pel_t)(( src[1] + 2 * src[2] + src[3] + 0 * src[4] + 2) >> 2); + pfirst[4][i] = (pel8_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); + pfirst[5][i] = (pel8_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); + pfirst[6][i] = (pel8_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); + pfirst[7][i] = (pel8_t)(( src[1] + 2 * src[2] + src[3] + 0 * src[4] + 2) >> 2); } bsy >>= 3; for (i = 0; i < bsy; i++) { - memcpy(dst , pfirst[0] + i, bsx * sizeof(pel_t)); - memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); - memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel_t)); - memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel_t)); - memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel_t)); - memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel_t)); - memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel_t)); - memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel_t)); + memcpy(dst , pfirst[0] + i, bsx * sizeof(pel8_t)); + memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * 
sizeof(pel8_t)); + memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel8_t)); + memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel8_t)); dst += i_dst8; } } else if (bsy == 8) { - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; - pel_t *dst5 = dst4 + i_dst; - pel_t *dst6 = dst5 + i_dst; - pel_t *dst7 = dst6 + i_dst; - pel_t *dst8 = dst7 + i_dst; + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; + pel8_t *dst5 = dst4 + i_dst; + pel8_t *dst6 = dst5 + i_dst; + pel8_t *dst7 = dst6 + i_dst; + pel8_t *dst8 = dst7 + i_dst; for (i = 0; i < bsx; i++, src++) { - dst1[i] = (pel_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); - dst2[i] = (pel_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); - dst3[i] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); - dst4[i] = (pel_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); + dst1[i] = (pel8_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); + dst2[i] = (pel8_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); + dst3[i] = (pel8_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); + dst4[i] = (pel8_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); - dst5[i] = (pel_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); - dst6[i] = (pel_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); - dst7[i] = (pel_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); - dst8[i] = (pel_t)(( src[1] + 2 * src[2] + src[3] + + 2) >> 2); + dst5[i] = (pel8_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); + dst6[i] = (pel8_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); + dst7[i] = (pel8_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); + dst8[i] = (pel8_t)(( src[1] + 2 
* src[2] + src[3] + + 2) >> 2); } } else { for (i = 0; i < bsx; i++, src++) { - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; - dst1[i] = (pel_t)(( 7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); - dst2[i] = (pel_t)(( 3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); - dst3[i] = (pel_t)(( 5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); - dst4[i] = (pel_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; + dst1[i] = (pel8_t)(( 7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); + dst2[i] = (pel8_t)(( 3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); + dst3[i] = (pel8_t)(( 5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); + dst4[i] = (pel8_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); } } } +static void intra_pred_ang10_x_11_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int i; + if (bsy > 8) { + ALIGN16(pel10_t first_line[(64 + 16) << 3]); + int line_size = bsx + (bsy >> 3) - 1; + int aligned_line_size = ((line_size + 15) >> 4) << 4; + int i_dst8 = i_dst << 3; + pel10_t *pfirst[8]; + + pfirst[0] = first_line; + pfirst[1] = pfirst[0] + aligned_line_size; + pfirst[2] = pfirst[1] + aligned_line_size; + pfirst[3] = pfirst[2] + aligned_line_size; + pfirst[4] = pfirst[3] + aligned_line_size; + pfirst[5] = pfirst[4] + aligned_line_size; + pfirst[6] = pfirst[5] + aligned_line_size; + pfirst[7] = pfirst[6] + aligned_line_size; + for (i = 0; i < line_size; i++, src++) { + pfirst[0][i] = (pel10_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); + pfirst[1][i] = (pel10_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); + pfirst[2][i] = (pel10_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); + pfirst[3][i] = (pel10_t)(( src[0] + 3 * 
src[1] + 3 * src[2] + src[3] + 4) >> 3); + + pfirst[4][i] = (pel10_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); + pfirst[5][i] = (pel10_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); + pfirst[6][i] = (pel10_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); + pfirst[7][i] = (pel10_t)(( src[1] + 2 * src[2] + src[3] + 0 * src[4] + 2) >> 2); + } + + bsy >>= 3; + for (i = 0; i < bsy; i++) { + memcpy(dst , pfirst[0] + i, bsx * sizeof(pel10_t)); + memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel10_t)); + memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel10_t)); + dst += i_dst8; + } + } else if (bsy == 8) { + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + pel10_t *dst5 = dst4 + i_dst; + pel10_t *dst6 = dst5 + i_dst; + pel10_t *dst7 = dst6 + i_dst; + pel10_t *dst8 = dst7 + i_dst; + for (i = 0; i < bsx; i++, src++) { + dst1[i] = (pel10_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); + dst2[i] = (pel10_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); + dst3[i] = (pel10_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); + dst4[i] = (pel10_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); + + dst5[i] = (pel10_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); + dst6[i] = (pel10_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); + dst7[i] = (pel10_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); + dst8[i] = (pel10_t)(( src[1] + 2 * src[2] + src[3] + + 2) >> 2); + } + } else { + for (i = 0; i < bsx; i++, src++) { + pel10_t *dst1 
= dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + dst1[i] = (pel10_t)(( 7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); + dst2[i] = (pel10_t)(( 3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); + dst3[i] = (pel10_t)(( 5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); + dst4[i] = (pel10_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); + } + } +} /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_y_25_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_y_25_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8) { - ALIGN16(pel_t first_line[64 + (64 << 3)]); + ALIGN16(pel8_t first_line[64 + (64 << 3)]); int line_size = bsx + ((bsy - 1) << 3); int iHeight8 = bsy << 3; for (i = 0; i < line_size; i += 8, src--) { - first_line[0 + i] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); - first_line[1 + i] = (pel_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); - first_line[2 + i] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); - first_line[3 + i] = (pel_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); + first_line[0 + i] = (pel8_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); + first_line[1 + i] = (pel8_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); + first_line[2 + i] = (pel8_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); + first_line[3 + i] = (pel8_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); - first_line[4 + i] = (pel_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5); - first_line[5 + i] = (pel_t)((src[0] * 1 + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); - first_line[6 + i] = (pel_t)((src[0] * 1 + 
src[-1] * 9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5); - first_line[7 + i] = (pel_t)(( src[-1] * 1 + src[-2] * 2 + src[-3] * 1 + 2) >> 2); + first_line[4 + i] = (pel8_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5); + first_line[5 + i] = (pel8_t)((src[0] * 1 + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); + first_line[6 + i] = (pel8_t)((src[0] * 1 + src[-1] * 9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5); + first_line[7 + i] = (pel8_t)(( src[-1] * 1 + src[-2] * 2 + src[-3] * 1 + 2) >> 2); } for (i = 0; i < iHeight8; i += 8) { - memcpy(dst, first_line + i, bsx * sizeof(pel_t)); + memcpy(dst, first_line + i, bsx * sizeof(pel8_t)); dst += i_dst; } } else if (bsx == 8) { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); - dst[1] = (pel_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); - dst[2] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); - dst[3] = (pel_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); - - dst[4] = (pel_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5); - dst[5] = (pel_t)((src[0] * 1 + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); - dst[6] = (pel_t)((src[0] * 1 + src[-1] * 9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5); - dst[7] = (pel_t)(( src[-1] * 1 + src[-2] * 2 + src[-3] * 1 + 2) >> 2); + dst[0] = (pel8_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); + dst[1] = (pel8_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); + dst[2] = (pel8_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); + dst[3] = (pel8_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); + + dst[4] = (pel8_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5); + dst[5] = (pel8_t)((src[0] * 1 + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); + dst[6] = (pel8_t)((src[0] * 1 + src[-1] * 9 + src[-2] * 
15 + src[-3] * 7 + 16) >> 5); + dst[7] = (pel8_t)(( src[-1] * 1 + src[-2] * 2 + src[-3] * 1 + 2) >> 2); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); - dst[1] = (pel_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); - dst[2] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); - dst[3] = (pel_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); + dst[0] = (pel8_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); + dst[1] = (pel8_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); + dst[2] = (pel8_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); + dst[3] = (pel8_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); + dst += i_dst; + } + } +} + +static void intra_pred_ang10_y_25_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int i; + + if (bsx > 8) { + ALIGN16(pel10_t first_line[64 + (64 << 3)]); + int line_size = bsx + ((bsy - 1) << 3); + int iHeight8 = bsy << 3; + for (i = 0; i < line_size; i += 8, src--) { + first_line[0 + i] = (pel10_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); + first_line[1 + i] = (pel10_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); + first_line[2 + i] = (pel10_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); + first_line[3 + i] = (pel10_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); + + first_line[4 + i] = (pel10_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5); + first_line[5 + i] = (pel10_t)((src[0] * 1 + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); + first_line[6 + i] = (pel10_t)((src[0] * 1 + src[-1] * 9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5); + first_line[7 + i] = (pel10_t)(( src[-1] * 1 + src[-2] * 2 + src[-3] * 1 + 2) >> 2); + } + for (i = 
0; i < iHeight8; i += 8) { + memcpy(dst, first_line + i, bsx * sizeof(pel10_t)); + dst += i_dst; + } + } else if (bsx == 8) { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel10_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); + dst[1] = (pel10_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); + dst[2] = (pel10_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); + dst[3] = (pel10_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); + + dst[4] = (pel10_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5); + dst[5] = (pel10_t)((src[0] * 1 + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); + dst[6] = (pel10_t)((src[0] * 1 + src[-1] * 9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5); + dst[7] = (pel10_t)(( src[-1] * 1 + src[-2] * 2 + src[-3] * 1 + 2) >> 2); + dst += i_dst; + } + } else { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel10_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); + dst[1] = (pel10_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); + dst[2] = (pel10_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); + dst[3] = (pel10_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); dst += i_dst; } } @@ -1010,32 +1838,63 @@ static void intra_pred_ang_y_25_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_y_26_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_y_26_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx != 4) { - ALIGN16(pel_t first_line[64 + 256]); + ALIGN16(pel8_t first_line[64 + 256]); int line_size = bsx + ((bsy - 1) << 2); int iHeight4 = bsy << 2; for (i = 0; i < line_size; i += 4, src--) { - first_line[i ] = (pel_t)((src[ 0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] + 8) 
>> 4); - first_line[i + 1] = (pel_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); - first_line[i + 2] = (pel_t)((src[ 0] + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); - first_line[i + 3] = (pel_t)((src[-1] + src[-2] * 2 + src[-3] + 2) >> 2); + first_line[i ] = (pel8_t)((src[ 0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] + 8) >> 4); + first_line[i + 1] = (pel8_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); + first_line[i + 2] = (pel8_t)((src[ 0] + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); + first_line[i + 3] = (pel8_t)((src[-1] + src[-2] * 2 + src[-3] + 2) >> 2); } for (i = 0; i < iHeight4; i += 4) { - memcpy(dst, first_line + i, bsx * sizeof(pel_t)); + memcpy(dst, first_line + i, bsx * sizeof(pel8_t)); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((src[ 0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] + 8) >> 4); - dst[1] = (pel_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); - dst[2] = (pel_t)((src[ 0] + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); - dst[3] = (pel_t)((src[-1] + src[-2] * 2 + src[-3] + 2) >> 2); + dst[0] = (pel8_t)((src[ 0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] + 8) >> 4); + dst[1] = (pel8_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); + dst[2] = (pel8_t)((src[ 0] + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); + dst[3] = (pel8_t)((src[-1] + src[-2] * 2 + src[-3] + 2) >> 2); + dst += i_dst; + } + } +} + +static void intra_pred_ang10_y_26_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int i; + + if (bsx != 4) { + ALIGN16(pel10_t first_line[64 + 256]); + int line_size = bsx + ((bsy - 1) << 2); + int iHeight4 = bsy << 2; + + for (i = 0; i < line_size; i += 4, src--) { + first_line[i ] = (pel10_t)((src[ 0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] + 8) >> 4); + first_line[i + 1] = (pel10_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); + first_line[i + 2] = (pel10_t)((src[ 0] + src[-1] * 
5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); + first_line[i + 3] = (pel10_t)((src[-1] + src[-2] * 2 + src[-3] + 2) >> 2); + } + + for (i = 0; i < iHeight4; i += 4) { + memcpy(dst, first_line + i, bsx * sizeof(pel10_t)); + dst += i_dst; + } + } else { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel10_t)((src[ 0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] + 8) >> 4); + dst[1] = (pel10_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); + dst[2] = (pel10_t)((src[ 0] + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); + dst[3] = (pel10_t)((src[-1] + src[-2] * 2 + src[-3] + 2) >> 2); dst += i_dst; } } @@ -1043,30 +1902,59 @@ static void intra_pred_ang_y_26_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_y_27_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_y_27_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int i; + if (bsx > 8) { + intra_pred_ang8_y_c(h, src, dst, i_dst, dir_mode, bsx, bsy); + } else if (bsx == 8) { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel8_t)((21 * src[0] + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7); + dst[1] = (pel8_t)(( 9 * src[0] + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7); + dst[2] = (pel8_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] + 1 * src[-4] + 32) >> 6); + dst[3] = (pel8_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] + 7 * src[-4] + 32) >> 6); + + dst[4] = (pel8_t)(( 3 * src[-1] + 19 * src[-2] + 29 * src[-3] + 13 * src[-4] + 32) >> 6); + dst[5] = (pel8_t)((27 * src[-2] + 59 * src[-3] + 37 * src[-4] + 5 * src[-5] + 64) >> 7); + dst[6] = (pel8_t)((15 * src[-2] + 47 * src[-3] + 49 * src[-4] + 17 * src[-5] + 64) >> 7); + dst[7] = (pel8_t)(( 3 * src[-2] + 35 * src[-3] + 61 * src[-4] + 29 * src[-5] + 64) >> 7); + dst += i_dst; + } + } else { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel8_t)((21 
* src[0] + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7); + dst[1] = (pel8_t)(( 9 * src[0] + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7); + dst[2] = (pel8_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] + 1 * src[-4] + 32) >> 6); + dst[3] = (pel8_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] + 7 * src[-4] + 32) >> 6); + dst += i_dst; + } + } +} + +static void intra_pred_ang10_y_27_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8) { - intra_pred_ang_y_c(src, dst, i_dst, dir_mode, bsx, bsy); + intra_pred_ang10_y_c(h, src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx == 8) { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((21 * src[0] + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7); - dst[1] = (pel_t)(( 9 * src[0] + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7); - dst[2] = (pel_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] + 1 * src[-4] + 32) >> 6); - dst[3] = (pel_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] + 7 * src[-4] + 32) >> 6); - - dst[4] = (pel_t)(( 3 * src[-1] + 19 * src[-2] + 29 * src[-3] + 13 * src[-4] + 32) >> 6); - dst[5] = (pel_t)((27 * src[-2] + 59 * src[-3] + 37 * src[-4] + 5 * src[-5] + 64) >> 7); - dst[6] = (pel_t)((15 * src[-2] + 47 * src[-3] + 49 * src[-4] + 17 * src[-5] + 64) >> 7); - dst[7] = (pel_t)(( 3 * src[-2] + 35 * src[-3] + 61 * src[-4] + 29 * src[-5] + 64) >> 7); + dst[0] = (pel10_t)((21 * src[0] + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7); + dst[1] = (pel10_t)(( 9 * src[0] + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7); + dst[2] = (pel10_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] + 1 * src[-4] + 32) >> 6); + dst[3] = (pel10_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] + 7 * src[-4] + 32) >> 6); + + dst[4] = (pel10_t)(( 3 * src[-1] + 19 * src[-2] + 29 * src[-3] + 13 * src[-4] + 32) >> 6); + dst[5] = (pel10_t)((27 * src[-2] + 59 * src[-3] + 37 * src[-4] + 5 * src[-5] + 64) >> 7); + dst[6] = (pel10_t)((15 
* src[-2] + 47 * src[-3] + 49 * src[-4] + 17 * src[-5] + 64) >> 7); + dst[7] = (pel10_t)(( 3 * src[-2] + 35 * src[-3] + 61 * src[-4] + 29 * src[-5] + 64) >> 7); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((21 * src[0] + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7); - dst[1] = (pel_t)(( 9 * src[0] + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7); - dst[2] = (pel_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] + 1 * src[-4] + 32) >> 6); - dst[3] = (pel_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] + 7 * src[-4] + 32) >> 6); + dst[0] = (pel10_t)((21 * src[0] + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7); + dst[1] = (pel10_t)(( 9 * src[0] + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7); + dst[2] = (pel10_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] + 1 * src[-4] + 32) >> 6); + dst[3] = (pel10_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] + 7 * src[-4] + 32) >> 6); dst += i_dst; } } @@ -1074,80 +1962,185 @@ static void intra_pred_ang_y_27_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_y_28_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_y_28_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - ALIGN16(pel_t first_line[64 + 128]); + ALIGN16(pel8_t first_line[64 + 128]); int line_size = bsx + ((bsy - 1) << 1); int iHeight2 = bsy << 1; int i; for (i = 0; i < line_size; i += 2, src--) { - first_line[i ] = (pel_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); - first_line[i + 1] = (pel_t)((src[-1] + (src[-2] << 1) + src[-3] + 2) >> 2); + first_line[i ] = (pel8_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); + first_line[i + 1] = (pel8_t)((src[-1] + (src[-2] << 1) + src[-3] + 2) >> 2); } for (i = 0; i < iHeight2; i += 2) { - memcpy(dst, first_line + i, bsx * sizeof(pel_t)); + 
memcpy(dst, first_line + i, bsx * sizeof(pel8_t)); + dst += i_dst; + } +} + +static void intra_pred_ang10_y_28_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel10_t first_line[64 + 128]); + int line_size = bsx + ((bsy - 1) << 1); + int iHeight2 = bsy << 1; + int i; + + for (i = 0; i < line_size; i += 2, src--) { + first_line[i ] = (pel10_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); + first_line[i + 1] = (pel10_t)((src[-1] + (src[-2] << 1) + src[-3] + 2) >> 2); + } + + for (i = 0; i < iHeight2; i += 2) { + memcpy(dst, first_line + i, bsx * sizeof(pel10_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_y_29_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_y_29_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int i; + if (bsx > 8) { + intra_pred_ang8_y_c(h, src, dst, i_dst, dir_mode, bsx, bsy); + } else if (bsx == 8) { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel8_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7); + dst[1] = (pel8_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6); + dst[2] = (pel8_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7); + dst[3] = (pel8_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7); + + dst[4] = (pel8_t)((src[-3] * 3 + src[-4] * 11 + src[-5] * 13 + src[-6] * 5 + 16) >> 5); + dst[5] = (pel8_t)((src[-4] * 21 + src[-5] * 53 + src[-6] * 43 + src[-7] * 11 + 64) >> 7); + dst[6] = (pel8_t)((src[-5] * 15 + src[-6] * 31 + src[-7] * 17 + src[-8] + 32) >> 6); + dst[7] = (pel8_t)((src[-5] * 3 + src[-6] * 19 + src[-7] * 29 + src[-8] * 13 + 32) >> 6); + dst += i_dst; + } + } else { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel8_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7); + dst[1] = 
(pel8_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6); + dst[2] = (pel8_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7); + dst[3] = (pel8_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7); + dst += i_dst; + } + } +} + +static void intra_pred_ang10_y_29_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8) { - intra_pred_ang_y_c(src, dst, i_dst, dir_mode, bsx, bsy); + intra_pred_ang10_y_c(h, src, dst, i_dst, dir_mode, bsx, bsy); + } else if (bsx == 8) { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel10_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7); + dst[1] = (pel10_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6); + dst[2] = (pel10_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7); + dst[3] = (pel10_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7); + + dst[4] = (pel10_t)((src[-3] * 3 + src[-4] * 11 + src[-5] * 13 + src[-6] * 5 + 16) >> 5); + dst[5] = (pel10_t)((src[-4] * 21 + src[-5] * 53 + src[-6] * 43 + src[-7] * 11 + 64) >> 7); + dst[6] = (pel10_t)((src[-5] * 15 + src[-6] * 31 + src[-7] * 17 + src[-8] + 32) >> 6); + dst[7] = (pel10_t)((src[-5] * 3 + src[-6] * 19 + src[-7] * 29 + src[-8] * 13 + 32) >> 6); + dst += i_dst; + } + } else { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel10_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7); + dst[1] = (pel10_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6); + dst[2] = (pel10_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7); + dst[3] = (pel10_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7); + dst += i_dst; + } + } +} + +/* --------------------------------------------------------------------------- + */ +static void intra_pred_ang8_y_30_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, 
int bsx, int bsy) +{ + ALIGN16(pel8_t first_line[64 + 64]); + int line_size = bsx + bsy - 1; + int i; + + src -= 2; + for (i = 0; i < line_size; i++, src--) { + first_line[i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + } + + for (i = 0; i < bsy; i++) { + memcpy(dst, first_line + i, bsx * sizeof(pel8_t)); + dst += i_dst; + } +} + +static void intra_pred_ang10_y_30_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel10_t first_line[64 + 64]); + int line_size = bsx + bsy - 1; + int i; + + src -= 2; + for (i = 0; i < line_size; i++, src--) { + first_line[i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + } + + for (i = 0; i < bsy; i++) { + memcpy(dst, first_line + i, bsx * sizeof(pel10_t)); + dst += i_dst; + } +} + +/* --------------------------------------------------------------------------- + */ +static void intra_pred_ang8_y_31_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel8_t dst_tran[MAX_CU_SIZE * MAX_CU_SIZE]); + ALIGN16(pel8_t src_tran[MAX_CU_SIZE << 3]); + int i; + if (bsx >= bsy) { + // transposition + // i < (bsx * 19 / 8 + 3) + for (i = 0; i < (bsy + bsx * 11 / 8 + 3); i++) { + src_tran[i] = src[-i]; + } + intra_pred_ang8_x_5_c(h, src_tran, dst_tran, bsy, 5, bsy, bsx); + for (i = 0; i < bsy; i++) { + for (int j = 0; j < bsx; j++) { + dst[j + i_dst * i] = dst_tran[i + bsy * j]; + } + } } else if (bsx == 8) { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7); - dst[1] = (pel_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6); - dst[2] = (pel_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7); - dst[3] = (pel_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7); - - dst[4] = (pel_t)((src[-3] * 3 + src[-4] * 11 + src[-5] * 13 + src[-6] * 5 + 16) >> 5); - dst[5] = (pel_t)((src[-4] * 21 + src[-5] * 53 + 
src[-6] * 43 + src[-7] * 11 + 64) >> 7); - dst[6] = (pel_t)((src[-5] * 15 + src[-6] * 31 + src[-7] * 17 + src[-8] + 32) >> 6); - dst[7] = (pel_t)((src[-5] * 3 + src[-6] * 19 + src[-7] * 29 + src[-8] * 13 + 32) >> 6); + dst[0] = (pel8_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5); + dst[1] = (pel8_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4); + dst[2] = (pel8_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5); + dst[3] = (pel8_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3); + + dst[4] = (pel8_t)((1 * src[-6] + 9 * src[-7] + 15 * src[-8] + 7 * src[-9] + 16) >> 5); + dst[5] = (pel8_t)((3 * src[-8] + 7 * src[-9] + 5 * src[-10] + 1 * src[-11] + 8) >> 4); + dst[6] = (pel8_t)((3 * src[-9] + 11 * src[-10] + 13 * src[-11] + 5 * src[-12] + 16) >> 5); + dst[7] = (pel8_t)((1 * src[-11] + 2 * src[-12] + 1 * src[-13] + 0 * src[-14] + 2) >> 2); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7); - dst[1] = (pel_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6); - dst[2] = (pel_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7); - dst[3] = (pel_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7); + dst[0] = (pel8_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5); + dst[1] = (pel8_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4); + dst[2] = (pel8_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5); + dst[3] = (pel8_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3); dst += i_dst; } } } -/* --------------------------------------------------------------------------- - */ -static void intra_pred_ang_y_30_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) -{ - ALIGN16(pel_t first_line[64 + 64]); - int line_size = bsx + bsy - 1; - int i; - - 
src -= 2; - for (i = 0; i < line_size; i++, src--) { - first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); - } - - for (i = 0; i < bsy; i++) { - memcpy(dst, first_line + i, bsx * sizeof(pel_t)); - dst += i_dst; - } -} - -/* --------------------------------------------------------------------------- - */ -static void intra_pred_ang_y_31_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang10_y_31_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - ALIGN16(pel_t dst_tran[MAX_CU_SIZE * MAX_CU_SIZE]); - ALIGN16(pel_t src_tran[MAX_CU_SIZE << 3]); + ALIGN16(pel10_t dst_tran[MAX_CU_SIZE * MAX_CU_SIZE]); + ALIGN16(pel10_t src_tran[MAX_CU_SIZE << 3]); int i; if (bsx >= bsy) { // transposition @@ -1155,7 +2148,7 @@ static void intra_pred_ang_y_31_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod for (i = 0; i < (bsy + bsx * 11 / 8 + 3); i++) { src_tran[i] = src[-i]; } - intra_pred_ang_x_5_c(src_tran, dst_tran, bsy, 5, bsy, bsx); + intra_pred_ang10_x_5_c(h, src_tran, dst_tran, bsy, 5, bsy, bsx); for (i = 0; i < bsy; i++) { for (int j = 0; j < bsx; j++) { dst[j + i_dst * i] = dst_tran[i + bsy * j]; @@ -1163,23 +2156,23 @@ static void intra_pred_ang_y_31_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod } } else if (bsx == 8) { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5); - dst[1] = (pel_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4); - dst[2] = (pel_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5); - dst[3] = (pel_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3); - - dst[4] = (pel_t)((1 * src[-6] + 9 * src[-7] + 15 * src[-8] + 7 * src[-9] + 16) >> 5); - dst[5] = (pel_t)((3 * src[-8] + 7 * src[-9] + 5 * src[-10] + 1 * src[-11] + 8) >> 4); - dst[6] = (pel_t)((3 * src[-9] + 11 * src[-10] + 13 * src[-11] + 5 * src[-12] + 16) >> 5); - dst[7] = 
(pel_t)((1 * src[-11] + 2 * src[-12] + 1 * src[-13] + 0 * src[-14] + 2) >> 2); + dst[0] = (pel10_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5); + dst[1] = (pel10_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4); + dst[2] = (pel10_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5); + dst[3] = (pel10_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3); + + dst[4] = (pel10_t)((1 * src[-6] + 9 * src[-7] + 15 * src[-8] + 7 * src[-9] + 16) >> 5); + dst[5] = (pel10_t)((3 * src[-8] + 7 * src[-9] + 5 * src[-10] + 1 * src[-11] + 8) >> 4); + dst[6] = (pel10_t)((3 * src[-9] + 11 * src[-10] + 13 * src[-11] + 5 * src[-12] + 16) >> 5); + dst[7] = (pel10_t)((1 * src[-11] + 2 * src[-12] + 1 * src[-13] + 0 * src[-14] + 2) >> 2); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5); - dst[1] = (pel_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4); - dst[2] = (pel_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5); - dst[3] = (pel_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3); + dst[0] = (pel10_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5); + dst[1] = (pel10_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4); + dst[2] = (pel10_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5); + dst[3] = (pel10_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3); dst += i_dst; } } @@ -1187,43 +2180,175 @@ static void intra_pred_ang_y_31_c(pel_t *src, pel_t *dst, int i_dst, int dir_mod /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_y_32_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_y_32_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + 
ALIGN16(pel8_t first_line[2 * (32 + 64)]); + int line_size = (bsy >> 1) + bsx - 1; + int aligned_line_size = ((line_size + 15) >> 4) << 4; + int i_dst2 = i_dst << 1; + int i; + pel8_t *pfirst[2]; + + pfirst[0] = first_line; + pfirst[1] = first_line + aligned_line_size; + + src -= 3; + for (i = 0; i < line_size; i++, src -= 2) { + pfirst[0][i] = (pel8_t)((src[1] + (src[ 0] << 1) + src[-1] + 2) >> 2); + pfirst[1][i] = (pel8_t)((src[0] + (src[-1] << 1) + src[-2] + 2) >> 2); + } + + bsy >>= 1; + for (i = 0; i < bsy; i++) { + memcpy(dst , pfirst[0] + i, bsx * sizeof(pel8_t)); + memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel8_t)); + dst += i_dst2; + } +} + +static void intra_pred_ang10_y_32_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - ALIGN16(pel_t first_line[2 * (32 + 64)]); + ALIGN16(pel10_t first_line[2 * (32 + 64)]); int line_size = (bsy >> 1) + bsx - 1; int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst2 = i_dst << 1; int i; - pel_t *pfirst[2]; + pel10_t *pfirst[2]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= 3; for (i = 0; i < line_size; i++, src -= 2) { - pfirst[0][i] = (pel_t)((src[1] + (src[ 0] << 1) + src[-1] + 2) >> 2); - pfirst[1][i] = (pel_t)((src[0] + (src[-1] << 1) + src[-2] + 2) >> 2); + pfirst[0][i] = (pel10_t)((src[1] + (src[ 0] << 1) + src[-1] + 2) >> 2); + pfirst[1][i] = (pel10_t)((src[0] + (src[-1] << 1) + src[-2] + 2) >> 2); } bsy >>= 1; for (i = 0; i < bsy; i++) { - memcpy(dst , pfirst[0] + i, bsx * sizeof(pel_t)); - memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); + memcpy(dst , pfirst[0] + i, bsx * sizeof(pel10_t)); + memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel10_t)); dst += i_dst2; } } /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_xy_13_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_xy_13_c(xavs2_t *h, pel8_t *src, pel8_t 
*dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int i; + if (bsy > 8) { + ALIGN16(pel8_t first_line[(64 + 16) << 3]); + int line_size = bsx + (bsy >> 3) - 1; + int left_size = line_size - bsx; + int aligned_line_size = ((line_size + 15) >> 4) << 4; + pel8_t *pfirst[8]; + + pfirst[0] = first_line; + pfirst[1] = pfirst[0] + aligned_line_size; + pfirst[2] = pfirst[1] + aligned_line_size; + pfirst[3] = pfirst[2] + aligned_line_size; + pfirst[4] = pfirst[3] + aligned_line_size; + pfirst[5] = pfirst[4] + aligned_line_size; + pfirst[6] = pfirst[5] + aligned_line_size; + pfirst[7] = pfirst[6] + aligned_line_size; + + src -= bsy - 8; + for (i = 0; i < left_size; i++, src += 8) { + pfirst[0][i] = (pel8_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2); + pfirst[1][i] = (pel8_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2); + pfirst[2][i] = (pel8_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2); + pfirst[3][i] = (pel8_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2); + + pfirst[4][i] = (pel8_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2); + pfirst[5][i] = (pel8_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); + pfirst[6][i] = (pel8_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2); + pfirst[7][i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + } + + for (; i < line_size; i++, src++) { + pfirst[0][i] = (pel8_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); + pfirst[1][i] = (pel8_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); + pfirst[2][i] = (pel8_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); + pfirst[3][i] = (pel8_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); + + pfirst[4][i] = (pel8_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5); + pfirst[5][i] = (pel8_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4); + pfirst[6][i] = (pel8_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5); + pfirst[7][i] = (pel8_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2); + } + + 
pfirst[0] += left_size; + pfirst[1] += left_size; + pfirst[2] += left_size; + pfirst[3] += left_size; + pfirst[4] += left_size; + pfirst[5] += left_size; + pfirst[6] += left_size; + pfirst[7] += left_size; + + bsy >>= 3; + for (i = 0; i < bsy; i++) { + memcpy(dst, pfirst[0] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + memcpy(dst, pfirst[1] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + memcpy(dst, pfirst[2] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + memcpy(dst, pfirst[3] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + memcpy(dst, pfirst[4] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + memcpy(dst, pfirst[5] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + memcpy(dst, pfirst[6] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + memcpy(dst, pfirst[7] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + } + } else if (bsy == 8) { + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; + pel8_t *dst5 = dst4 + i_dst; + pel8_t *dst6 = dst5 + i_dst; + pel8_t *dst7 = dst6 + i_dst; + pel8_t *dst8 = dst7 + i_dst; + for (i = 0; i < bsx; i++, src++) { + dst1[i] = (pel8_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); + dst2[i] = (pel8_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); + dst3[i] = (pel8_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); + dst4[i] = (pel8_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); + + dst5[i] = (pel8_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5); + dst6[i] = (pel8_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4); + dst7[i] = (pel8_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5); + dst8[i] = (pel8_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2); + } + } else { + for (i = 0; i < bsx; i++, src++) { + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; + dst1[i] = (pel8_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); + 
dst2[i] = (pel8_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); + dst3[i] = (pel8_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); + dst4[i] = (pel8_t)(( src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); + } + } +} + +static void intra_pred_ang10_xy_13_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsy > 8) { - ALIGN16(pel_t first_line[(64 + 16) << 3]); + ALIGN16(pel10_t first_line[(64 + 16) << 3]); int line_size = bsx + (bsy >> 3) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; - pel_t *pfirst[8]; + pel10_t *pfirst[8]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; @@ -1236,27 +2361,27 @@ static void intra_pred_ang_xy_13_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo src -= bsy - 8; for (i = 0; i < left_size; i++, src += 8) { - pfirst[0][i] = (pel_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2); - pfirst[1][i] = (pel_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2); - pfirst[2][i] = (pel_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2); - pfirst[3][i] = (pel_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2); + pfirst[0][i] = (pel10_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2); + pfirst[1][i] = (pel10_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2); + pfirst[2][i] = (pel10_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2); + pfirst[3][i] = (pel10_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2); - pfirst[4][i] = (pel_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2); - pfirst[5][i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); - pfirst[6][i] = (pel_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2); - pfirst[7][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + pfirst[4][i] = (pel10_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2); + pfirst[5][i] = (pel10_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); + pfirst[6][i] = (pel10_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2); + pfirst[7][i] = (pel10_t)((src[-1] + 
(src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size; i++, src++) { - pfirst[0][i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); - pfirst[1][i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); - pfirst[2][i] = (pel_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); - pfirst[3][i] = (pel_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); + pfirst[0][i] = (pel10_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); + pfirst[1][i] = (pel10_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); + pfirst[2][i] = (pel10_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); + pfirst[3][i] = (pel10_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); - pfirst[4][i] = (pel_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5); - pfirst[5][i] = (pel_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4); - pfirst[6][i] = (pel_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5); - pfirst[7][i] = (pel_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2); + pfirst[4][i] = (pel10_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5); + pfirst[5][i] = (pel10_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4); + pfirst[6][i] = (pel10_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5); + pfirst[7][i] = (pel10_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2); } pfirst[0] += left_size; @@ -1270,66 +2395,131 @@ static void intra_pred_ang_xy_13_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo bsy >>= 3; for (i = 0; i < bsy; i++) { - memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[0] - i, bsx * sizeof(pel10_t)); dst += i_dst; - memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[1] - i, bsx * sizeof(pel10_t)); dst += i_dst; - memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[2] - i, bsx * sizeof(pel10_t)); dst += i_dst; - memcpy(dst, pfirst[3] - i, 
bsx * sizeof(pel_t)); + memcpy(dst, pfirst[3] - i, bsx * sizeof(pel10_t)); dst += i_dst; - memcpy(dst, pfirst[4] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[4] - i, bsx * sizeof(pel10_t)); dst += i_dst; - memcpy(dst, pfirst[5] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[5] - i, bsx * sizeof(pel10_t)); dst += i_dst; - memcpy(dst, pfirst[6] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[6] - i, bsx * sizeof(pel10_t)); dst += i_dst; - memcpy(dst, pfirst[7] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[7] - i, bsx * sizeof(pel10_t)); dst += i_dst; } } else if (bsy == 8) { - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; - pel_t *dst5 = dst4 + i_dst; - pel_t *dst6 = dst5 + i_dst; - pel_t *dst7 = dst6 + i_dst; - pel_t *dst8 = dst7 + i_dst; + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + pel10_t *dst5 = dst4 + i_dst; + pel10_t *dst6 = dst5 + i_dst; + pel10_t *dst7 = dst6 + i_dst; + pel10_t *dst8 = dst7 + i_dst; + for (i = 0; i < bsx; i++, src++) { + dst1[i] = (pel10_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); + dst2[i] = (pel10_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); + dst3[i] = (pel10_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); + dst4[i] = (pel10_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); + + dst5[i] = (pel10_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5); + dst6[i] = (pel10_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4); + dst7[i] = (pel10_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5); + dst8[i] = (pel10_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2); + } + } else { for (i = 0; i < bsx; i++, src++) { - dst1[i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); - dst2[i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); - dst3[i] = (pel_t)((5 * src[2] 
+ 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); - dst4[i] = (pel_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; + dst1[i] = (pel10_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); + dst2[i] = (pel10_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); + dst3[i] = (pel10_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); + dst4[i] = (pel10_t)(( src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); + } + } +} + +/* --------------------------------------------------------------------------- + */ +static void intra_pred_ang8_xy_14_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int i; + + if (bsy != 4) { + ALIGN16(pel8_t first_line[4 * (64 + 16)]); + int line_size = bsx + (bsy >> 2) - 1; + int left_size = line_size - bsx; + int aligned_line_size = ((line_size + 15) >> 4) << 4; + pel8_t *pfirst[4]; + + pfirst[0] = first_line; + pfirst[1] = first_line + aligned_line_size; + pfirst[2] = first_line + aligned_line_size * 2; + pfirst[3] = first_line + aligned_line_size * 3; + + src -= bsy - 4; + for (i = 0; i < left_size; i++, src += 4) { + pfirst[0][i] = (pel8_t)((src[ 2] + (src[3] << 1) + src[4] + 2) >> 2); + pfirst[1][i] = (pel8_t)((src[ 1] + (src[2] << 1) + src[3] + 2) >> 2); + pfirst[2][i] = (pel8_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2); + pfirst[3][i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + } + + for (; i < line_size; i++, src++) { + pfirst[0][i] = (pel8_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); + pfirst[1][i] = (pel8_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); + pfirst[2][i] = (pel8_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); + pfirst[3][i] = (pel8_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); + } + + pfirst[0] += left_size; + pfirst[1] += left_size; + pfirst[2] += 
left_size; + pfirst[3] += left_size; - dst5[i] = (pel_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5); - dst6[i] = (pel_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4); - dst7[i] = (pel_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5); - dst8[i] = (pel_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2); + bsy >>= 2; + for (i = 0; i < bsy; i++) { + memcpy(dst, pfirst[0] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + memcpy(dst, pfirst[1] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + memcpy(dst, pfirst[2] - i, bsx * sizeof(pel8_t)); + dst += i_dst; + memcpy(dst, pfirst[3] - i, bsx * sizeof(pel8_t)); + dst += i_dst; } } else { + pel8_t *dst1 = dst; + pel8_t *dst2 = dst1 + i_dst; + pel8_t *dst3 = dst2 + i_dst; + pel8_t *dst4 = dst3 + i_dst; + for (i = 0; i < bsx; i++, src++) { - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; - dst1[i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); - dst2[i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); - dst3[i] = (pel_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); - dst4[i] = (pel_t)(( src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); + dst1[i] = (pel8_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); + dst2[i] = (pel8_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); + dst3[i] = (pel8_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); + dst4[i] = (pel8_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); } } } -static void intra_pred_ang_xy_14_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) + +static void intra_pred_ang10_xy_14_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsy != 4) { - ALIGN16(pel_t first_line[4 * (64 + 16)]); + ALIGN16(pel10_t first_line[4 * (64 + 16)]); int line_size = bsx + (bsy >> 2) - 1; int left_size = line_size - bsx; int 
aligned_line_size = ((line_size + 15) >> 4) << 4; - pel_t *pfirst[4]; + pel10_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; @@ -1338,17 +2528,17 @@ static void intra_pred_ang_xy_14_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo src -= bsy - 4; for (i = 0; i < left_size; i++, src += 4) { - pfirst[0][i] = (pel_t)((src[ 2] + (src[3] << 1) + src[4] + 2) >> 2); - pfirst[1][i] = (pel_t)((src[ 1] + (src[2] << 1) + src[3] + 2) >> 2); - pfirst[2][i] = (pel_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2); - pfirst[3][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + pfirst[0][i] = (pel10_t)((src[ 2] + (src[3] << 1) + src[4] + 2) >> 2); + pfirst[1][i] = (pel10_t)((src[ 1] + (src[2] << 1) + src[3] + 2) >> 2); + pfirst[2][i] = (pel10_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2); + pfirst[3][i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size; i++, src++) { - pfirst[0][i] = (pel_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); - pfirst[1][i] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); - pfirst[2][i] = (pel_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); - pfirst[3][i] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); + pfirst[0][i] = (pel10_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); + pfirst[1][i] = (pel10_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); + pfirst[2][i] = (pel10_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); + pfirst[3][i] = (pel10_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); } pfirst[0] += left_size; @@ -1358,40 +2548,75 @@ static void intra_pred_ang_xy_14_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo bsy >>= 2; for (i = 0; i < bsy; i++) { - memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[0] - i, bsx * sizeof(pel10_t)); dst += i_dst; - memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[1] - i, bsx * sizeof(pel10_t)); dst += i_dst; - 
memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[2] - i, bsx * sizeof(pel10_t)); dst += i_dst; - memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); + memcpy(dst, pfirst[3] - i, bsx * sizeof(pel10_t)); dst += i_dst; } } else { - pel_t *dst1 = dst; - pel_t *dst2 = dst1 + i_dst; - pel_t *dst3 = dst2 + i_dst; - pel_t *dst4 = dst3 + i_dst; + pel10_t *dst1 = dst; + pel10_t *dst2 = dst1 + i_dst; + pel10_t *dst3 = dst2 + i_dst; + pel10_t *dst4 = dst3 + i_dst; for (i = 0; i < bsx; i++, src++) { - dst1[i] = (pel_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); - dst2[i] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); - dst3[i] = (pel_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); - dst4[i] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); + dst1[i] = (pel10_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); + dst2[i] = (pel10_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); + dst3[i] = (pel10_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); + dst4[i] = (pel10_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); } } } /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_xy_16_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_xy_16_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel8_t first_line[2 * (64 + 32)]); + int line_size = bsx + (bsy >> 1) - 1; + int left_size = line_size - bsx; + int aligned_line_size = ((line_size + 15) >> 4) << 4; + int i_dst2 = i_dst << 1; + pel8_t *pfirst[2]; + int i; + + pfirst[0] = first_line; + pfirst[1] = first_line + aligned_line_size; + + src -= bsy - 2; + for (i = 0; i < left_size; i++, src += 2) { + pfirst[0][i] = (pel8_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2); + pfirst[1][i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + } + + for (; i < line_size; i++, src++) { + pfirst[0][i] 
= (pel8_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); + pfirst[1][i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + } + + pfirst[0] += left_size; + pfirst[1] += left_size; + + bsy >>= 1; + for (i = 0; i < bsy; i++) { + memcpy(dst , pfirst[0] - i, bsx * sizeof(pel8_t)); + memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel8_t)); + dst += i_dst2; + } +} + +static void intra_pred_ang10_xy_16_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - ALIGN16(pel_t first_line[2 * (64 + 32)]); + ALIGN16(pel10_t first_line[2 * (64 + 32)]); int line_size = bsx + (bsy >> 1) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst2 = i_dst << 1; - pel_t *pfirst[2]; + pel10_t *pfirst[2]; int i; pfirst[0] = first_line; @@ -1399,13 +2624,13 @@ static void intra_pred_ang_xy_16_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo src -= bsy - 2; for (i = 0; i < left_size; i++, src += 2) { - pfirst[0][i] = (pel_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2); - pfirst[1][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + pfirst[0][i] = (pel10_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2); + pfirst[1][i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size; i++, src++) { - pfirst[0][i] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); - pfirst[1][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + pfirst[0][i] = (pel10_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); + pfirst[1][i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } pfirst[0] += left_size; @@ -1413,28 +2638,47 @@ static void intra_pred_ang_xy_16_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo bsy >>= 1; for (i = 0; i < bsy; i++) { - memcpy(dst , pfirst[0] - i, bsx * sizeof(pel_t)); - memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel_t)); + memcpy(dst , pfirst[0] - i, bsx * sizeof(pel10_t)); + memcpy(dst + i_dst, pfirst[1] - i, bsx * 
sizeof(pel10_t)); dst += i_dst2; } } /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_xy_18_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_xy_18_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel8_t first_line[64 + 64]); + int line_size = bsx + bsy - 1; + int i; + pel8_t *pfirst = first_line + bsy - 1; + + src -= bsy - 1; + for (i = 0; i < line_size; i++, src++) { + first_line[i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + } + + for (i = 0; i < bsy; i++) { + memcpy(dst, pfirst, bsx * sizeof(pel8_t)); + pfirst--; + dst += i_dst; + } +} + +static void intra_pred_ang10_xy_18_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - ALIGN16(pel_t first_line[64 + 64]); + ALIGN16(pel10_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; - pel_t *pfirst = first_line + bsy - 1; + pel10_t *pfirst = first_line + bsy - 1; src -= bsy - 1; for (i = 0; i < line_size; i++, src++) { - first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + first_line[i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (i = 0; i < bsy; i++) { - memcpy(dst, pfirst, bsx * sizeof(pel_t)); + memcpy(dst, pfirst, bsx * sizeof(pel10_t)); pfirst--; dst += i_dst; } @@ -1442,28 +2686,55 @@ static void intra_pred_ang_xy_18_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_xy_20_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_xy_20_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + ALIGN16(pel8_t first_line[64 + 128]); + int left_size = ((bsy - 1) << 1) + 1; + int top_size = bsx - 1; + int line_size = left_size + top_size; + int i; + pel8_t *pfirst = first_line + left_size - 
1; + + src -= bsy; + for (i = 0; i < left_size; i += 2, src++) { + first_line[i ] = (pel8_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); + first_line[i + 1] = (pel8_t)(( src[0] + (src[1] << 1) + src[2] + 2) >> 2); + } + i--; + + for (; i < line_size; i++, src++) { + first_line[i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + } + + for (i = 0; i < bsy; i++) { + memcpy(dst, pfirst, bsx * sizeof(pel8_t)); + pfirst -= 2; + dst += i_dst; + } +} + +static void intra_pred_ang10_xy_20_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { - ALIGN16(pel_t first_line[64 + 128]); + ALIGN16(pel10_t first_line[64 + 128]); int left_size = ((bsy - 1) << 1) + 1; int top_size = bsx - 1; int line_size = left_size + top_size; int i; - pel_t *pfirst = first_line + left_size - 1; + pel10_t *pfirst = first_line + left_size - 1; src -= bsy; for (i = 0; i < left_size; i += 2, src++) { - first_line[i ] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); - first_line[i + 1] = (pel_t)(( src[0] + (src[1] << 1) + src[2] + 2) >> 2); + first_line[i ] = (pel10_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); + first_line[i + 1] = (pel10_t)(( src[0] + (src[1] << 1) + src[2] + 2) >> 2); } i--; for (; i < line_size; i++, src++) { - first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + first_line[i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (i = 0; i < bsy; i++) { - memcpy(dst, pfirst, bsx * sizeof(pel_t)); + memcpy(dst, pfirst, bsx * sizeof(pel10_t)); pfirst -= 2; dst += i_dst; } @@ -1471,41 +2742,82 @@ static void intra_pred_ang_xy_20_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_xy_22_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_xy_22_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + 
int i; + + if (bsx != 4) { + src -= bsy; + ALIGN16(pel8_t first_line[64 + 256]); + int left_size = ((bsy - 1) << 2) + 3; + int top_size = bsx - 3; + int line_size = left_size + top_size; + pel8_t *pfirst = first_line + left_size - 3; + + for (i = 0; i < left_size; i += 4, src++) { + first_line[i ] = (pel8_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); + first_line[i + 1] = (pel8_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); + first_line[i + 2] = (pel8_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); + first_line[i + 3] = (pel8_t)(( src[0] + src[1] * 2 + src[2] + 2) >> 2); + } + i--; + + for (; i < line_size; i++, src++) { + first_line[i] = (pel8_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + } + + for (i = 0; i < bsy; i++) { + memcpy(dst, pfirst, bsx * sizeof(pel8_t)); + dst += i_dst; + pfirst -= 4; + } + } else { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel8_t)((src[-2] * 3 + src[-1] * 7 + src[0] * 5 + src[1] + 8) >> 4); + dst[1] = (pel8_t)((src[-2] + (src[-1] + src[0]) * 3 + src[1] + 4) >> 3); + dst[2] = (pel8_t)((src[-2] + src[-1] * 5 + src[0] * 7 + src[1] * 3 + 8) >> 4); + dst[3] = (pel8_t)(( src[-1] + src[0] * 2 + src[1] + 2) >> 2); + dst += i_dst; + } + // needn't pad, (3,0) is equal for ang_x and ang_y + } +} + +static void intra_pred_ang10_xy_22_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx != 4) { src -= bsy; - ALIGN16(pel_t first_line[64 + 256]); + ALIGN16(pel10_t first_line[64 + 256]); int left_size = ((bsy - 1) << 2) + 3; int top_size = bsx - 3; int line_size = left_size + top_size; - pel_t *pfirst = first_line + left_size - 3; + pel10_t *pfirst = first_line + left_size - 3; for (i = 0; i < left_size; i += 4, src++) { - first_line[i ] = (pel_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); - first_line[i + 1] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); - first_line[i + 2] = (pel_t)((src[-1] + src[0] * 5 + src[1] 
* 7 + src[2] * 3 + 8) >> 4); - first_line[i + 3] = (pel_t)(( src[0] + src[1] * 2 + src[2] + 2) >> 2); + first_line[i ] = (pel10_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); + first_line[i + 1] = (pel10_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); + first_line[i + 2] = (pel10_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); + first_line[i + 3] = (pel10_t)(( src[0] + src[1] * 2 + src[2] + 2) >> 2); } i--; for (; i < line_size; i++, src++) { - first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); + first_line[i] = (pel10_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (i = 0; i < bsy; i++) { - memcpy(dst, pfirst, bsx * sizeof(pel_t)); + memcpy(dst, pfirst, bsx * sizeof(pel10_t)); dst += i_dst; pfirst -= 4; } } else { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((src[-2] * 3 + src[-1] * 7 + src[0] * 5 + src[1] + 8) >> 4); - dst[1] = (pel_t)((src[-2] + (src[-1] + src[0]) * 3 + src[1] + 4) >> 3); - dst[2] = (pel_t)((src[-2] + src[-1] * 5 + src[0] * 7 + src[1] * 3 + 8) >> 4); - dst[3] = (pel_t)(( src[-1] + src[0] * 2 + src[1] + 2) >> 2); + dst[0] = (pel10_t)((src[-2] * 3 + src[-1] * 7 + src[0] * 5 + src[1] + 8) >> 4); + dst[1] = (pel10_t)((src[-2] + (src[-1] + src[0]) * 3 + src[1] + 4) >> 3); + dst[2] = (pel10_t)((src[-2] + src[-1] * 5 + src[0] * 7 + src[1] * 3 + 8) >> 4); + dst[3] = (pel10_t)(( src[-1] + src[0] * 2 + src[1] + 2) >> 2); dst += i_dst; } // needn't pad, (3,0) is equal for ang_x and ang_y @@ -1514,60 +2826,119 @@ static void intra_pred_ang_xy_22_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo /* --------------------------------------------------------------------------- */ -static void intra_pred_ang_xy_23_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +static void intra_pred_ang8_xy_23_c(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +{ + int i; + + if (bsx > 8) { + ALIGN16(pel8_t first_line[64 + 512]); + int left_size = (bsy 
<< 3) - 1; + int top_size = bsx - 7; + int line_size = left_size + top_size; + pel8_t *pfirst = first_line + left_size - 7; + + src -= bsy; + for (i = 0; i < left_size; i += 8, src++) { + first_line[i ] = (pel8_t)((7 * src[-1] + 15 * src[0] + 9 * src[1] + src[2] + 16) >> 5); + first_line[i + 1] = (pel8_t)((3 * src[-1] + 7 * src[0] + 5 * src[1] + src[2] + 8) >> 4); + first_line[i + 2] = (pel8_t)((5 * src[-1] + 13 * src[0] + 11 * src[1] + 3 * src[2] + 16) >> 5); + first_line[i + 3] = (pel8_t)(( src[-1] + 3 * src[0] + 3 * src[1] + src[2] + 4) >> 3); + + first_line[i + 4] = (pel8_t)((3 * src[-1] + 11 * src[0] + 13 * src[1] + 5 * src[2] + 16) >> 5); + first_line[i + 5] = (pel8_t)(( src[-1] + 5 * src[0] + 7 * src[1] + 3 * src[2] + 8) >> 4); + first_line[i + 6] = (pel8_t)(( src[-1] + 9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5); + first_line[i + 7] = (pel8_t)(( src[ 0] + 2 * src[1] + src[2] + 0 * src[3] + 2) >> 2); + } + i--; + + for (; i < line_size; i++, src++) { + first_line[i] = (pel8_t)((src[1] + (src[0] << 1) + src[-1] + 2) >> 2); + } + + for (i = 0; i < bsy; i++) { + memcpy(dst, pfirst, bsx * sizeof(pel8_t)); + dst += i_dst; + pfirst -= 8; + } + } else if (bsx == 8) { + for (i = 0; i < bsy; i++, src--) { + dst[0] = (pel8_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5); + dst[1] = (pel8_t)((3 * src[-2] + 7 * src[-1] + 5 * src[0] + src[1] + 8) >> 4); + dst[2] = (pel8_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5); + dst[3] = (pel8_t)(( src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3); + + dst[4] = (pel8_t)((3 * src[-2] + 11 * src[-1] + 13 * src[0] + 5 * src[1] + 16) >> 5); + dst[5] = (pel8_t)(( src[-2] + 5 * src[-1] + 7 * src[0] + 3 * src[1] + 8) >> 4); + dst[6] = (pel8_t)(( src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); + dst[7] = (pel8_t)(( src[-1] + 2 * src[ 0] + src[1] + 0 * src[2] + 2) >> 2); + dst += i_dst; + } + // needn't pad, (7,0) is equal for ang_x and ang_y + } else { + for (i = 0; i < bsy; 
i++, src--) { + dst[0] = (pel8_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5); + dst[1] = (pel8_t)((3 * src[-2] + 7 * src[-1] + 5 * src[0] + src[1] + 8) >> 4); + dst[2] = (pel8_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5); + dst[3] = (pel8_t)((src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3); + dst += i_dst; + } + } +} + +static void intra_pred_ang10_xy_23_c(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8) { - ALIGN16(pel_t first_line[64 + 512]); + ALIGN16(pel10_t first_line[64 + 512]); int left_size = (bsy << 3) - 1; int top_size = bsx - 7; int line_size = left_size + top_size; - pel_t *pfirst = first_line + left_size - 7; + pel10_t *pfirst = first_line + left_size - 7; src -= bsy; for (i = 0; i < left_size; i += 8, src++) { - first_line[i ] = (pel_t)((7 * src[-1] + 15 * src[0] + 9 * src[1] + src[2] + 16) >> 5); - first_line[i + 1] = (pel_t)((3 * src[-1] + 7 * src[0] + 5 * src[1] + src[2] + 8) >> 4); - first_line[i + 2] = (pel_t)((5 * src[-1] + 13 * src[0] + 11 * src[1] + 3 * src[2] + 16) >> 5); - first_line[i + 3] = (pel_t)(( src[-1] + 3 * src[0] + 3 * src[1] + src[2] + 4) >> 3); + first_line[i ] = (pel10_t)((7 * src[-1] + 15 * src[0] + 9 * src[1] + src[2] + 16) >> 5); + first_line[i + 1] = (pel10_t)((3 * src[-1] + 7 * src[0] + 5 * src[1] + src[2] + 8) >> 4); + first_line[i + 2] = (pel10_t)((5 * src[-1] + 13 * src[0] + 11 * src[1] + 3 * src[2] + 16) >> 5); + first_line[i + 3] = (pel10_t)(( src[-1] + 3 * src[0] + 3 * src[1] + src[2] + 4) >> 3); - first_line[i + 4] = (pel_t)((3 * src[-1] + 11 * src[0] + 13 * src[1] + 5 * src[2] + 16) >> 5); - first_line[i + 5] = (pel_t)(( src[-1] + 5 * src[0] + 7 * src[1] + 3 * src[2] + 8) >> 4); - first_line[i + 6] = (pel_t)(( src[-1] + 9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5); - first_line[i + 7] = (pel_t)(( src[ 0] + 2 * src[1] + src[2] + 0 * src[3] + 2) >> 2); + first_line[i + 4] = (pel10_t)((3 * src[-1] + 11 * 
src[0] + 13 * src[1] + 5 * src[2] + 16) >> 5); + first_line[i + 5] = (pel10_t)(( src[-1] + 5 * src[0] + 7 * src[1] + 3 * src[2] + 8) >> 4); + first_line[i + 6] = (pel10_t)(( src[-1] + 9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5); + first_line[i + 7] = (pel10_t)(( src[ 0] + 2 * src[1] + src[2] + 0 * src[3] + 2) >> 2); } i--; for (; i < line_size; i++, src++) { - first_line[i] = (pel_t)((src[1] + (src[0] << 1) + src[-1] + 2) >> 2); + first_line[i] = (pel10_t)((src[1] + (src[0] << 1) + src[-1] + 2) >> 2); } for (i = 0; i < bsy; i++) { - memcpy(dst, pfirst, bsx * sizeof(pel_t)); + memcpy(dst, pfirst, bsx * sizeof(pel10_t)); dst += i_dst; pfirst -= 8; } } else if (bsx == 8) { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5); - dst[1] = (pel_t)((3 * src[-2] + 7 * src[-1] + 5 * src[0] + src[1] + 8) >> 4); - dst[2] = (pel_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5); - dst[3] = (pel_t)(( src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3); - - dst[4] = (pel_t)((3 * src[-2] + 11 * src[-1] + 13 * src[0] + 5 * src[1] + 16) >> 5); - dst[5] = (pel_t)(( src[-2] + 5 * src[-1] + 7 * src[0] + 3 * src[1] + 8) >> 4); - dst[6] = (pel_t)(( src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); - dst[7] = (pel_t)(( src[-1] + 2 * src[ 0] + src[1] + 0 * src[2] + 2) >> 2); + dst[0] = (pel10_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5); + dst[1] = (pel10_t)((3 * src[-2] + 7 * src[-1] + 5 * src[0] + src[1] + 8) >> 4); + dst[2] = (pel10_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5); + dst[3] = (pel10_t)(( src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3); + + dst[4] = (pel10_t)((3 * src[-2] + 11 * src[-1] + 13 * src[0] + 5 * src[1] + 16) >> 5); + dst[5] = (pel10_t)(( src[-2] + 5 * src[-1] + 7 * src[0] + 3 * src[1] + 8) >> 4); + dst[6] = (pel10_t)(( src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); + dst[7] = (pel10_t)(( src[-1] 
+ 2 * src[ 0] + src[1] + 0 * src[2] + 2) >> 2); dst += i_dst; } // needn't pad, (7,0) is equal for ang_x and ang_y } else { for (i = 0; i < bsy; i++, src--) { - dst[0] = (pel_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5); - dst[1] = (pel_t)((3 * src[-2] + 7 * src[-1] + 5 * src[0] + src[1] + 8) >> 4); - dst[2] = (pel_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5); - dst[3] = (pel_t)((src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3); + dst[0] = (pel10_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5); + dst[1] = (pel10_t)((3 * src[-2] + 7 * src[-1] + 5 * src[0] + src[1] + 8) >> 4); + dst[2] = (pel10_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5); + dst[3] = (pel10_t)((src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3); dst += i_dst; } } @@ -1575,15 +2946,76 @@ static void intra_pred_ang_xy_23_c(pel_t *src, pel_t *dst, int i_dst, int dir_mo /* --------------------------------------------------------------------------- * fill reference samples for intra prediction - * LCU内在上边界的PU + * LCU鍐呭湪涓婅竟鐣岀殑PU */ static -void fill_reference_samples_0_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) +void fill_reference_samples8_0_c(xavs2_t *h, const pel8_t *pTL, int i_TL, const pel8_t *pLcuEP, pel8_t *EP, uint32_t i_avai, int bsx, int bsy) +{ + int num_padding = 0; + + /* fill default value */ + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1); + + /* get prediction pixels --------------------------------------- + * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels + * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 
2*bsx+4 + */ + + /* fill top & top-right pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { + /* fill top pixels */ + g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel8_t)); + } + + /* fill top-right pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { + g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel8_t)); + } else { + g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel + } + + /* fill extra pixels */ + num_padding = bsy * 11 / 4 - bsx + 4; + if (num_padding > 0) { + g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 + } + + /* fill left & left-down pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { + /* fill left pixels */ + memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel8_t)); + } + + /* fill left-down pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { + memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel8_t)); + } else { + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); + } + + /* fill top-left pixel */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { + EP[0] = pLcuEP[0]; + } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { + EP[0] = pLcuEP[1]; + } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { + EP[0] = pLcuEP[-1]; + } + + /* fill extra pixels */ + num_padding = bsx * 11 / 4 - bsy + 4; + if (num_padding > 0) { + g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 + } +} + +static +void fill_reference_samples10_0_c(xavs2_t *h, const pel10_t *pTL, int i_TL, const pel10_t *pLcuEP, pel10_t *EP, uint32_t i_avai, int bsx, int bsy) { int num_padding = 0; /* fill default value */ - g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1); + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1); /* get prediction pixels 
--------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels @@ -1593,12 +3025,12 @@ void fill_reference_samples_0_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ - g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t)); + g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel10_t)); } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { - g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t)); + g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel10_t)); } else { g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel } @@ -1612,12 +3044,12 @@ void fill_reference_samples_0_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { /* fill left pixels */ - memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t)); + memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel10_t)); } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { - memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t)); + memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel10_t)); } else { g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); } @@ -1640,16 +3072,89 @@ void fill_reference_samples_0_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* --------------------------------------------------------------------------- * fill reference samples for intra prediction - * LCU内在上边界的PU + * LCU鍐呭湪涓婅竟鐣岀殑PU */ static -void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) +void fill_reference_samples8_x_c(xavs2_t *h, const pel8_t *pTL, int i_TL, const pel8_t *pLcuEP, pel8_t *EP, uint32_t i_avai, int bsx, int bsy) +{ + const pel8_t *pL = pTL + i_TL; + int num_padding 
= 0; + + /* fill default value */ + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1); + + /* get prediction pixels --------------------------------------- + * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels + * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4 + */ + + /* fill top & top-right pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { + /* fill top pixels */ + g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel8_t)); + } + + /* fill top-right pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { + g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel8_t)); + } else { + g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel + } + + /* fill extra pixels */ + num_padding = bsy * 11 / 4 - bsx + 4; + if (num_padding > 0) { + g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 + } + + /* fill left & left-down pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { + const pel8_t *p_l = pL; + int y; + /* fill left pixels */ + for (y = 0; y < bsy; y++) { + EP[-1 - y] = *p_l; + p_l += i_TL; + } + } + + /* fill left-down pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { + int y; + const pel8_t *p_l = pL + bsy * i_TL; + + for (y = 0; y < bsy; y++) { + EP[-bsy - 1 - y] = *p_l; + p_l += i_TL; + } + } else { + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); + } + + /* fill top-left pixel */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { + EP[0] = pLcuEP[0]; + } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { + EP[0] = pLcuEP[1]; + } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { + EP[0] = pL[0]; + } + + /* fill extra pixels */ + num_padding = bsx * 11 / 4 - bsy + 4; + if (num_padding > 0) { + g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], 
EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 + } +} + +static +void fill_reference_samples10_x_c(xavs2_t *h, const pel10_t *pTL, int i_TL, const pel10_t *pLcuEP, pel10_t *EP, uint32_t i_avai, int bsx, int bsy) { - const pel_t *pL = pTL + i_TL; + const pel10_t *pL = pTL + i_TL; int num_padding = 0; /* fill default value */ - g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1); + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1); /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels @@ -1659,12 +3164,12 @@ void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ - g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t)); + g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel10_t)); } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { - g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t)); + g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel10_t)); } else { g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel } @@ -1677,7 +3182,7 @@ void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { - const pel_t *p_l = pL; + const pel10_t *p_l = pL; int y; /* fill left pixels */ for (y = 0; y < bsy; y++) { @@ -1689,7 +3194,7 @@ void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { int y; - const pel_t *p_l = pL + bsy * i_TL; + const pel10_t *p_l = pL + bsy * i_TL; for (y = 0; y < bsy; y++) { EP[-bsy - 1 - y] = *p_l; @@ 
-1717,16 +3222,78 @@ void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* --------------------------------------------------------------------------- * fill reference samples for intra prediction - * LCU内在左边界上的PU + * LCU鍐呭湪宸﹁竟鐣屼笂鐨凱U */ static -void fill_reference_samples_y_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) +void fill_reference_samples8_y_c(xavs2_t *h, const pel8_t *pTL, int i_TL, const pel8_t *pLcuEP, pel8_t *EP, uint32_t i_avai, int bsx, int bsy) +{ + const pel8_t *pT = pTL + 1; + int num_padding = 0; + + /* fill default value */ + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1); + + /* get prediction pixels --------------------------------------- + * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels + * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 
2*bsx+4 + */ + + /* fill top & top-right pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { + /* fill top pixels */ + g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel8_t)); + } + + /* fill top-right pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { + g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel8_t)); + } else { + g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel + } + + /* fill extra pixels */ + num_padding = bsy * 11 / 4 - bsx + 4; + if (num_padding > 0) { + g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 + } + + /* fill left & left-down pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { + /* fill left pixels */ + memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel8_t)); + } + + /* fill left-down pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { + memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel8_t)); + } else { + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); + } + + /* fill top-left pixel */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { + EP[0] = pLcuEP[0]; + } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { + EP[0] = pT[0]; + } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { + EP[0] = pLcuEP[-1]; + } + + /* fill extra pixels */ + num_padding = bsx * 11 / 4 - bsy + 4; + if (num_padding > 0) { + g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 + } +} + +static +void fill_reference_samples10_y_c(xavs2_t *h, const pel10_t *pTL, int i_TL, const pel10_t *pLcuEP, pel10_t *EP, uint32_t i_avai, int bsx, int bsy) { - const pel_t *pT = pTL + 1; + const pel10_t *pT = pTL + 1; int num_padding = 0; /* fill default value */ - g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1); + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1); /* get 
prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels @@ -1736,12 +3303,12 @@ void fill_reference_samples_y_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ - g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel_t)); + g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel10_t)); } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { - g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t)); + g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel10_t)); } else { g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel } @@ -1755,12 +3322,12 @@ void fill_reference_samples_y_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { /* fill left pixels */ - memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t)); + memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel10_t)); } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { - memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t)); + memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel10_t)); } else { g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); } @@ -1783,17 +3350,91 @@ void fill_reference_samples_y_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* --------------------------------------------------------------------------- * fill reference samples for intra prediction - * LCU内不在边界上的PU + * LCU鍐呬笉鍦ㄨ竟鐣屼笂鐨凱U */ static -void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) +void fill_reference_samples8_xy_c(xavs2_t *h, const pel8_t *pTL, int i_TL, const pel8_t *pLcuEP, pel8_t *EP, uint32_t i_avai, int bsx, int bsy) +{ + const pel8_t *pT = pTL + 1; + const pel8_t *pL = pTL + 
i_TL; + int num_padding = 0; + + /* fill default value */ + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1); + + /* get prediction pixels --------------------------------------- + * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels + * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4 + */ + + /* fill top & top-right pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { + /* fill top pixels */ + g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel8_t)); + } + + /* fill top-right pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { + g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel8_t)); + } else { + g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel + } + + /* fill extra pixels */ + num_padding = bsy * 11 / 4 - bsx + 4; + if (num_padding > 0) { + g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 + } + + /* fill left & left-down pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { + const pel8_t *p_l = pL; + int y; + /* fill left pixels */ + for (y = 0; y < bsy; y++) { + EP[-1 - y] = *p_l; + p_l += i_TL; + } + } + + /* fill left-down pixels */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { + int y; + const pel8_t *p_l = pL + bsy * i_TL; + + for (y = 0; y < bsy; y++) { + EP[-bsy - 1 - y] = *p_l; + p_l += i_TL; + } + } else { + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); + } + + /* fill top-left pixel */ + if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { + EP[0] = pTL[0]; + } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { + EP[0] = pT[0]; + } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { + EP[0] = pL[0]; + } + + /* fill extra pixels */ + num_padding = bsx * 11 / 4 - bsy + 4; + if (num_padding > 0) { + g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], 
EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 + } +} + +static +void fill_reference_samples10_xy_c(xavs2_t *h, const pel10_t *pTL, int i_TL, const pel10_t *pLcuEP, pel10_t *EP, uint32_t i_avai, int bsx, int bsy) { - const pel_t *pT = pTL + 1; - const pel_t *pL = pTL + i_TL; + const pel10_t *pT = pTL + 1; + const pel10_t *pL = pTL + i_TL; int num_padding = 0; /* fill default value */ - g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1); + g_funcs.mem_repeat_p(&EP[-(bsy << 1)], ((1 << h->param->input_sample_bit_depth) >> 1), ((bsy + bsx) << 1) + 1); /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels @@ -1803,12 +3444,12 @@ void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ - g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel_t)); + g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel10_t)); } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { - g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t)); + g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel10_t)); } else { g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel } @@ -1821,7 +3462,7 @@ void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { - const pel_t *p_l = pL; + const pel10_t *p_l = pL; int y; /* fill left pixels */ for (y = 0; y < bsy; y++) { @@ -1833,7 +3474,7 @@ void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { int y; - const pel_t *p_l = pL + bsy * i_TL; + const pel10_t *p_l = pL + bsy * i_TL; for (y = 0; y < bsy; y++) { 
EP[-bsy - 1 - y] = *p_l; @@ -1867,65 +3508,118 @@ void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP /* --------------------------------------------------------------------------- */ -void xavs2_intra_pred_init(uint32_t cpuid, intrinsic_func_t *pf) +void xavs2_intra_pred_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf) { #define ANG_X_OFFSET 3 #define ANG_XY_OFFSET 13 #define ANG_Y_OFFSET 25 int i; - intra_pred_t *ipred = pf->intraf; + if (param->input_sample_bit_depth == 8) { + intra8_pred_t *ipred = pf->intraf8; + + pf->fill_edge8_f[0] = fill_reference_samples8_0_c; + pf->fill_edge8_f[1] = fill_reference_samples8_x_c; + pf->fill_edge8_f[2] = fill_reference_samples8_y_c; + pf->fill_edge8_f[3] = fill_reference_samples8_xy_c; + ipred[DC_PRED ] = intra_pred_dc8_c; // 0 + ipred[PLANE_PRED] = intra_pred_plane8_c; // 1 + ipred[BI_PRED ] = intra_pred_bilinear8_c; // 2 + + for (i = ANG_X_OFFSET; i < VERT_PRED; i++) { + ipred[i ] = intra_pred_ang8_x_c; // 3 ~ 11 + } + ipred[VERT_PRED ] = intra_pred_ver8_c; // 12 + + for (i = ANG_XY_OFFSET; i < HOR_PRED; i++) { + ipred[i ] = intra_pred_ang8_xy_c; // 13 ~ 23 + } + + ipred[HOR_PRED ] = intra_pred_hor8_c; // 24 + for (i = ANG_Y_OFFSET; i < NUM_INTRA_MODE; i++) { + ipred[i ] = intra_pred_ang8_y_c; // 25 ~ 32 + } + + ipred[INTRA_ANG_X_3 ] = intra_pred_ang8_x_3_c; + ipred[INTRA_ANG_X_4 ] = intra_pred_ang8_x_4_c; + ipred[INTRA_ANG_X_5 ] = intra_pred_ang8_x_5_c; + ipred[INTRA_ANG_X_6 ] = intra_pred_ang8_x_6_c; + ipred[INTRA_ANG_X_7 ] = intra_pred_ang8_x_7_c; + ipred[INTRA_ANG_X_8 ] = intra_pred_ang8_x_8_c; + ipred[INTRA_ANG_X_9 ] = intra_pred_ang8_x_9_c; + ipred[INTRA_ANG_X_10] = intra_pred_ang8_x_10_c; + ipred[INTRA_ANG_X_11] = intra_pred_ang8_x_11_c; + + ipred[INTRA_ANG_XY_13] = intra_pred_ang8_xy_13_c; + ipred[INTRA_ANG_XY_14] = intra_pred_ang8_xy_14_c; + ipred[INTRA_ANG_XY_16] = intra_pred_ang8_xy_16_c; + ipred[INTRA_ANG_XY_18] = intra_pred_ang8_xy_18_c; + ipred[INTRA_ANG_XY_20] = 
intra_pred_ang8_xy_20_c; + ipred[INTRA_ANG_XY_22] = intra_pred_ang8_xy_22_c; + ipred[INTRA_ANG_XY_23] = intra_pred_ang8_xy_23_c; + + ipred[INTRA_ANG_Y_25] = intra_pred_ang8_y_25_c; + ipred[INTRA_ANG_Y_26] = intra_pred_ang8_y_26_c; + ipred[INTRA_ANG_Y_27] = intra_pred_ang8_y_27_c; + ipred[INTRA_ANG_Y_28] = intra_pred_ang8_y_28_c; + ipred[INTRA_ANG_Y_29] = intra_pred_ang8_y_29_c; + ipred[INTRA_ANG_Y_30] = intra_pred_ang8_y_30_c; + ipred[INTRA_ANG_Y_31] = intra_pred_ang8_y_31_c; + ipred[INTRA_ANG_Y_32] = intra_pred_ang8_y_32_c; + } else { + intra10_pred_t *ipred = pf->intraf10; - pf->fill_edge_f[0] = fill_reference_samples_0_c; - pf->fill_edge_f[1] = fill_reference_samples_x_c; - pf->fill_edge_f[2] = fill_reference_samples_y_c; - pf->fill_edge_f[3] = fill_reference_samples_xy_c; - ipred[DC_PRED ] = intra_pred_dc_c; // 0 - ipred[PLANE_PRED] = intra_pred_plane_c; // 1 - ipred[BI_PRED ] = intra_pred_bilinear_c; // 2 + pf->fill_edge10_f[0] = fill_reference_samples10_0_c; + pf->fill_edge10_f[1] = fill_reference_samples10_x_c; + pf->fill_edge10_f[2] = fill_reference_samples10_y_c; + pf->fill_edge10_f[3] = fill_reference_samples10_xy_c; + ipred[DC_PRED ] = intra_pred_dc10_c; // 0 + ipred[PLANE_PRED] = intra_pred_plane10_c; // 1 + ipred[BI_PRED ] = intra_pred_bilinear10_c; // 2 for (i = ANG_X_OFFSET; i < VERT_PRED; i++) { - ipred[i ] = intra_pred_ang_x_c; // 3 ~ 11 + ipred[i ] = intra_pred_ang10_x_c; // 3 ~ 11 } - ipred[VERT_PRED ] = intra_pred_ver_c; // 12 + ipred[VERT_PRED ] = intra_pred_ver10_c; // 12 for (i = ANG_XY_OFFSET; i < HOR_PRED; i++) { - ipred[i ] = intra_pred_ang_xy_c; // 13 ~ 23 + ipred[i ] = intra_pred_ang10_xy_c; // 13 ~ 23 } - ipred[HOR_PRED ] = intra_pred_hor_c; // 24 + ipred[HOR_PRED ] = intra_pred_hor10_c; // 24 for (i = ANG_Y_OFFSET; i < NUM_INTRA_MODE; i++) { - ipred[i ] = intra_pred_ang_y_c; // 25 ~ 32 - } - - ipred[INTRA_ANG_X_3 ] = intra_pred_ang_x_3_c; - ipred[INTRA_ANG_X_4 ] = intra_pred_ang_x_4_c; - ipred[INTRA_ANG_X_5 ] = intra_pred_ang_x_5_c; - 
ipred[INTRA_ANG_X_6 ] = intra_pred_ang_x_6_c; - ipred[INTRA_ANG_X_7 ] = intra_pred_ang_x_7_c; - ipred[INTRA_ANG_X_8 ] = intra_pred_ang_x_8_c; - ipred[INTRA_ANG_X_9 ] = intra_pred_ang_x_9_c; - ipred[INTRA_ANG_X_10] = intra_pred_ang_x_10_c; - ipred[INTRA_ANG_X_11] = intra_pred_ang_x_11_c; - - ipred[INTRA_ANG_XY_13] = intra_pred_ang_xy_13_c; - ipred[INTRA_ANG_XY_14] = intra_pred_ang_xy_14_c; - ipred[INTRA_ANG_XY_16] = intra_pred_ang_xy_16_c; - ipred[INTRA_ANG_XY_18] = intra_pred_ang_xy_18_c; - ipred[INTRA_ANG_XY_20] = intra_pred_ang_xy_20_c; - ipred[INTRA_ANG_XY_22] = intra_pred_ang_xy_22_c; - ipred[INTRA_ANG_XY_23] = intra_pred_ang_xy_23_c; - - ipred[INTRA_ANG_Y_25] = intra_pred_ang_y_25_c; - ipred[INTRA_ANG_Y_26] = intra_pred_ang_y_26_c; - ipred[INTRA_ANG_Y_27] = intra_pred_ang_y_27_c; - ipred[INTRA_ANG_Y_28] = intra_pred_ang_y_28_c; - ipred[INTRA_ANG_Y_29] = intra_pred_ang_y_29_c; - ipred[INTRA_ANG_Y_30] = intra_pred_ang_y_30_c; - ipred[INTRA_ANG_Y_31] = intra_pred_ang_y_31_c; - ipred[INTRA_ANG_Y_32] = intra_pred_ang_y_32_c; - - // TODO: 8bit情况下角度7、9、11性能不一致 20170716 + ipred[i ] = intra_pred_ang10_y_c; // 25 ~ 32 + } + + ipred[INTRA_ANG_X_3 ] = intra_pred_ang10_x_3_c; + ipred[INTRA_ANG_X_4 ] = intra_pred_ang10_x_4_c; + ipred[INTRA_ANG_X_5 ] = intra_pred_ang10_x_5_c; + ipred[INTRA_ANG_X_6 ] = intra_pred_ang10_x_6_c; + ipred[INTRA_ANG_X_7 ] = intra_pred_ang10_x_7_c; + ipred[INTRA_ANG_X_8 ] = intra_pred_ang10_x_8_c; + ipred[INTRA_ANG_X_9 ] = intra_pred_ang10_x_9_c; + ipred[INTRA_ANG_X_10] = intra_pred_ang10_x_10_c; + ipred[INTRA_ANG_X_11] = intra_pred_ang10_x_11_c; + + ipred[INTRA_ANG_XY_13] = intra_pred_ang10_xy_13_c; + ipred[INTRA_ANG_XY_14] = intra_pred_ang10_xy_14_c; + ipred[INTRA_ANG_XY_16] = intra_pred_ang10_xy_16_c; + ipred[INTRA_ANG_XY_18] = intra_pred_ang10_xy_18_c; + ipred[INTRA_ANG_XY_20] = intra_pred_ang10_xy_20_c; + ipred[INTRA_ANG_XY_22] = intra_pred_ang10_xy_22_c; + ipred[INTRA_ANG_XY_23] = intra_pred_ang10_xy_23_c; + + ipred[INTRA_ANG_Y_25] = 
intra_pred_ang10_y_25_c; + ipred[INTRA_ANG_Y_26] = intra_pred_ang10_y_26_c; + ipred[INTRA_ANG_Y_27] = intra_pred_ang10_y_27_c; + ipred[INTRA_ANG_Y_28] = intra_pred_ang10_y_28_c; + ipred[INTRA_ANG_Y_29] = intra_pred_ang10_y_29_c; + ipred[INTRA_ANG_Y_30] = intra_pred_ang10_y_30_c; + ipred[INTRA_ANG_Y_31] = intra_pred_ang10_y_31_c; + ipred[INTRA_ANG_Y_32] = intra_pred_ang10_y_32_c; + } + + // TODO: 8bit情况下角度7、9、11性能不一致 20170716 #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE42) { ipred[DC_PRED ] = intra_pred_dc_sse128; @@ -1933,6 +3627,7 @@ void xavs2_intra_pred_init(uint32_t cpuid, intrinsic_func_t *pf) ipred[VERT_PRED ] = intra_pred_ver_sse128; ipred[PLANE_PRED ] = intra_pred_plane_sse128; ipred[BI_PRED ] = intra_pred_bilinear_sse128; +#if !HIGH_BIT_DEPTH ipred[INTRA_ANG_X_3 ] = intra_pred_ang_x_3_sse128; ipred[INTRA_ANG_X_4 ] = intra_pred_ang_x_4_sse128; ipred[INTRA_ANG_X_5 ] = intra_pred_ang_x_5_sse128; @@ -1959,9 +3654,11 @@ void xavs2_intra_pred_init(uint32_t cpuid, intrinsic_func_t *pf) pf->fill_edge_f[1] = fill_edge_samples_x_sse128; pf->fill_edge_f[2] = fill_edge_samples_y_sse128; pf->fill_edge_f[3] = fill_edge_samples_xy_sse128; +#endif } /* 8/10bit assemble*/ +#if defined(__AVX2__) if (cpuid & XAVS2_CPU_AVX2) { ipred[DC_PRED ] = intra_pred_dc_avx; ipred[HOR_PRED ] = intra_pred_hor_avx; @@ -1994,8 +3691,8 @@ void xavs2_intra_pred_init(uint32_t cpuid, intrinsic_func_t *pf) ipred[INTRA_ANG_Y_30 ] = intra_pred_ang_y_30_avx; ipred[INTRA_ANG_Y_31 ] = intra_pred_ang_y_31_avx; ipred[INTRA_ANG_Y_32 ] = intra_pred_ang_y_32_avx; - } +#endif #endif //if HAVE_MMX #undef ANG_X_OFFSET #undef ANG_XY_OFFSET diff --git a/source/common/intra.h b/source/common/intra.h index a7c4a6a..dff0973 100644 --- a/source/common/intra.h +++ b/source/common/intra.h @@ -46,24 +46,44 @@ uint32_t xavs2_intra_get_cu_neighbors(xavs2_t *h, cu_t *p_cu, int img_x, int img void xavs2_intra_fill_ref_samples_luma(xavs2_t *h, cu_t *p_cu, int img_x, int img_y, int block_x, int block_y, int bsx, int bsy); 
-#define rdo_get_pred_intra_luma FPFX(rdo_get_pred_intra_luma) -int rdo_get_pred_intra_luma(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, - pel_t *p_fenc, int mpm[], int blockidx, +#define rdo_get_pred_intra_luma8 FPFX(rdo_get_pred_intra_luma8) +int rdo_get_pred_intra_luma8(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel8_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); -#define rdo_get_pred_intra_luma_rmd FPFX(rdo_get_pred_intra_luma_rmd) -int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, - pel_t *p_fenc, int mpm[], int blockidx, +#define rdo_get_pred_intra_luma10 FPFX(rdo_get_pred_intra_luma10) +int rdo_get_pred_intra_luma10(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel10_t *p_fenc, int mpm[], int blockidx, + int block_x, int block_y, int block_w, int block_h); + +#define rdo_get_pred_intra_luma8_rmd FPFX(rdo_get_pred_intra_luma8_rmd) +int rdo_get_pred_intra_luma8_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel8_t *p_fenc, int mpm[], int blockidx, + int block_x, int block_y, int block_w, int block_h); + +#define rdo_get_pred_intra_luma10_rmd FPFX(rdo_get_pred_intra_luma10_rmd) +int rdo_get_pred_intra_luma10_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel10_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); -#define rdo_get_pred_intra_luma_cuda FPFX(rdo_get_pred_intra_luma_cuda) -int rdo_get_pred_intra_luma_cuda(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, - pel_t *p_fenc, int mpm[], int blockidx, +#define rdo_get_pred_intra_luma8_cuda FPFX(rdo_get_pred_intra_luma8_cuda) +int rdo_get_pred_intra_luma8_cuda(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel8_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); -#define rdo_get_pred_intra_luma_2nd_pass FPFX(rdo_get_pred_intra_luma_2nd_pass) -int 
rdo_get_pred_intra_luma_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, - pel_t *p_fenc, int mpm[], int blockidx, +#define rdo_get_pred_intra_luma10_cuda FPFX(rdo_get_pred_intra_luma10_cuda) +int rdo_get_pred_intra_luma10_cuda(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel10_t *p_fenc, int mpm[], int blockidx, + int block_x, int block_y, int block_w, int block_h); + +#define rdo_get_pred_intra_luma8_2nd_pass FPFX(rdo_get_pred_intra_luma8_2nd_pass) +int rdo_get_pred_intra_luma8_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel8_t *p_fenc, int mpm[], int blockidx, + int block_x, int block_y, int block_w, int block_h); + +#define rdo_get_pred_intra_luma10_2nd_pass FPFX(rdo_get_pred_intra_luma10_2nd_pass) +int rdo_get_pred_intra_luma10_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel10_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); #define rdo_get_pred_intra_chroma FPFX(rdo_get_pred_intra_chroma) diff --git a/source/common/mc.c b/source/common/mc.c index c1da03e..7bba8e5 100644 --- a/source/common/mc.c +++ b/source/common/mc.c @@ -167,10 +167,20 @@ enum intpl_pos_e { /* --------------------------------------------------------------------------- */ static void -mc_copy_c(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h) +mc_copy8_c(xavs2_t *bb, pel8_t *dst, intptr_t i_dst, pel8_t *src, intptr_t i_src, int w, int h) { while (h--) { - memcpy(dst, src, w * sizeof(pel_t)); + memcpy(dst, src, w * sizeof(pel8_t)); + dst += i_dst; + src += i_src; + } +} + +static void +mc_copy10_c(xavs2_t *bb, pel10_t *dst, intptr_t i_dst, pel10_t *src, intptr_t i_src, int w, int h) +{ + while (h--) { + memcpy(dst, src, w * sizeof(pel10_t)); dst += i_dst; src += i_src; } @@ -180,19 +190,29 @@ mc_copy_c(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h) * plane copy */ static void -plane_copy_c(pel_t *dst, intptr_t i_dst, pel_t *src, 
intptr_t i_src, int w, int h) +plane_copy8_c(xavs2_t *bb, pel8_t *dst, intptr_t i_dst, pel8_t *src, intptr_t i_src, int w, int h) +{ + while (h--) { + memcpy(dst, src, w * sizeof(pel8_t)); + dst += i_dst; + src += i_src; + } +} + +static void +plane_copy10_c(xavs2_t *bb, pel10_t *dst, intptr_t i_dst, pel10_t *src, intptr_t i_src, int w, int h) { while (h--) { - memcpy(dst, src, w * sizeof(pel_t)); + memcpy(dst, src, w * sizeof(pel10_t)); dst += i_dst; src += i_src; } } -#define PLANE_COPY(align, cpu) \ -void plane_copy_##cpu(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h)\ +#define PLANE_COPY8(align, cpu) \ +void plane_copy8_##cpu(pel8_t *dst, intptr_t i_dst, pel8_t *src, intptr_t i_src, int w, int h)\ {\ - int c_w = (align) / sizeof(pel_t) - 1;\ + int c_w = (align) / sizeof(pel8_t) - 1;\ if (w < 256) { /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ plane_copy_c( dst, i_dst, src, i_src, w, h );\ } else if (!(w & c_w)) {\ @@ -208,19 +228,56 @@ void plane_copy_##cpu(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, in }\ }\ /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\ - memcpy( dst, src, w*sizeof(pel_t) );\ + memcpy( dst, src, w*sizeof(pel8_t) );\ + }\ +} + +#define PLANE_COPY10(align, cpu) \ +void plane_copy10_##cpu(pel10_t *dst, intptr_t i_dst, pel10_t *src, intptr_t i_src, int w, int h)\ +{\ + int c_w = (align) / sizeof(pel10_t) - 1;\ + if (w < 256) { /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. 
*/\ + plane_copy_c( dst, i_dst, src, i_src, w, h );\ + } else if (!(w & c_w)) {\ + xavs2_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ + } else {\ + if (--h > 0) {\ + if( i_src > 0 ) {\ + xavs2_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + } else {\ + xavs2_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + }\ + /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\ + memcpy( dst, src, w*sizeof(pel10_t) );\ }\ } #if HAVE_MMX -PLANE_COPY(16, mmx2) +PLANE_COPY8(16, mmx2) +PLANE_COPY10(16, mmx2) #endif /* --------------------------------------------------------------------------- * deinterleave copy, for chroma planes */ static void -plane_copy_deinterleave_c(pel_t *dstu, intptr_t i_dstu, pel_t *dstv, intptr_t i_dstv, pel_t *src, intptr_t i_src, int w, int h) +plane_copy8_deinterleave_c(xavs2_t *bb, pel8_t *dstu, intptr_t i_dstu, pel8_t *dstv, intptr_t i_dstv, pel8_t *src, intptr_t i_src, int w, int h) +{ + int x, y; + + for (y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src) { + for (x = 0; x < w; x++) { + dstu[x] = src[2*x ]; + dstv[x] = src[2*x + 1]; + } + } +} + +static void +plane_copy10_deinterleave_c(xavs2_t *bb, pel10_t *dstu, intptr_t i_dstu, pel10_t *dstv, intptr_t i_dstv, pel10_t *src, intptr_t i_src, int w, int h) { int x, y; @@ -272,48 +329,92 @@ void mem_repeat_8i_c(void *dst, int val, size_t count) /* --------------------------------------------------------------------------- */ static void -intpl_chroma_block_hor_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) +intpl_chroma8_block_hor_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const *coeff) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + + int x, y, v; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + v = (FLT_4TAP_HOR(src, x, coeff) + 32) >> 6; + dst[x] = (pel8_t)XAVS2_CLIP1(v); + } + src += i_src; + dst += i_dst; + } +#undef XAVS2_CLIP1 +} + +static void +intpl_chroma10_block_hor_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const *coeff) { +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + int x, y, v; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = (FLT_4TAP_HOR(src, x, coeff) + 32) >> 6; - dst[x] = (pel_t)XAVS2_CLIP1(v); + dst[x] = (pel10_t)XAVS2_CLIP1(v); } src += i_src; dst += i_dst; } +#undef XAVS2_CLIP1 } /* --------------------------------------------------------------------------- */ static void -intpl_chroma_block_ver_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) +intpl_chroma8_block_ver_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const *coeff) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + + int x, y, v; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + v = (FLT_4TAP_VER(src, x, i_src, coeff) + 32) >> 6; + dst[x] = (pel8_t)XAVS2_CLIP1(v); + } + src += i_src; + dst += i_dst; + } +#undef XAVS2_CLIP1 +} + +static void +intpl_chroma10_block_ver_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const *coeff) { +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + int x, y, v; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = (FLT_4TAP_VER(src, x, i_src, coeff) + 32) >> 6; - dst[x] = (pel_t)XAVS2_CLIP1(v); + dst[x] = (pel10_t)XAVS2_CLIP1(v); } src += i_src; dst += i_dst; } +#undef XAVS2_CLIP1 } /* --------------------------------------------------------------------------- */ static void -intpl_chroma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v) +intpl_chroma8_block_ext_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v) { +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + ALIGN16(int32_t tmp_res[(32 + 3) * 32]); int32_t *tmp = tmp_res; - const int shift1 = g_bit_depth - 8; + const int shift1 = h->param->input_sample_bit_depth - 8; const int add1 = (1 << shift1) >> 1; - const int shift2 = 20 - g_bit_depth; - const int add2 = 1 << (shift2 - 1); // 1<<(19-g_bit_depth) + const int shift2 = 20 - h->param->input_sample_bit_depth; + const int add2 = 1 << (shift2 - 1); // 1<<(19-h->param->input_sample_bit_depth) int x, y, v; src -= i_src; @@ -329,45 +430,103 @@ intpl_chroma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = (FLT_4TAP_VER(tmp, x, 32, coeff_v) + add2) >> shift2; - dst[x] = (pel_t)XAVS2_CLIP1(v); + dst[x] = (pel8_t)XAVS2_CLIP1(v); } dst += i_dst; tmp += 32; } +#undef XAVS2_CLIP1 +} + +static void +intpl_chroma10_block_ext_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + + ALIGN16(int32_t tmp_res[(32 + 3) * 32]); + int32_t *tmp = tmp_res; + const int shift1 = h->param->input_sample_bit_depth - 8; + const int add1 = (1 << shift1) >> 1; + const int shift2 = 20 - h->param->input_sample_bit_depth; + const int add2 = 1 << (shift2 - 1); // 1<<(19-h->param->input_sample_bit_depth) + int x, y, v; + + src -= i_src; + for (y = -1; y < height + 2; y++) { + for (x = 0; x < width; x++) { + v = FLT_4TAP_HOR(src, x, coeff_h); + tmp[x] = (v + add1) >> shift1; + } + src += i_src; + tmp += 32; + } + tmp = tmp_res + 32; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + v = (FLT_4TAP_VER(tmp, x, 32, coeff_v) + add2) >> shift2; + dst[x] = (pel10_t)XAVS2_CLIP1(v); + } + dst += i_dst; + tmp += 32; + } +#undef XAVS2_CLIP1 } /* --------------------------------------------------------------------------- */ static void -intpl_luma_block_hor_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) +intpl_luma8_block_hor_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const *coeff) { +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + int x, y, v; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = (FLT_8TAP_HOR(src, x, coeff) + 32) >> 6; - dst[x] = (pel_t)XAVS2_CLIP1(v); + dst[x] = (pel8_t)XAVS2_CLIP1(v); } src += i_src; dst += i_dst; } +#undef XAVS2_CLIP1 +} + +static void +intpl_luma10_block_hor_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const *coeff) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + + int x, y, v; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + v = (FLT_8TAP_HOR(src, x, coeff) + 32) >> 6; + dst[x] = (pel10_t)XAVS2_CLIP1(v); + } + src += i_src; + dst += i_dst; + } +#undef XAVS2_CLIP1 } /* --------------------------------------------------------------------------- */ -#define intpl_luma_block_ver_c intpl_luma_ver_c +#define intpl_luma8_block_ver_c intpl_luma8_ver_c +#define intpl_luma10_block_ver_c intpl_luma10_ver_c /* --------------------------------------------------------------------------- */ static void -intpl_luma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v) +intpl_luma8_block_ext_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v) { +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) #define TMP_STRIDE 64 - const int shift1 = g_bit_depth - 8; + const int shift1 = h->param->input_sample_bit_depth - 8; const int add1 = (1 << shift1) >> 1; - const int shift2 = 20 - g_bit_depth; - const int add2 = 1 << (shift2 - 1);//1<<(19-bit_depth) + const int shift2 = 20 - h->param->input_sample_bit_depth; + const int add2 = 1 << (shift2 - 1);//1<<(19-h->input_sample_bit_depth) ALIGN16(mct_t tmp_buf[(64 + 7) * TMP_STRIDE]); mct_t *tmp = tmp_buf; @@ -387,7 +546,7 @@ intpl_luma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = (FLT_8TAP_VER(tmp, x, TMP_STRIDE, coeff_v) + add2) >> shift2; - dst[x] = (pel_t)XAVS2_CLIP1(v); + dst[x] = (pel8_t)XAVS2_CLIP1(v); } dst += i_dst; @@ -395,80 +554,198 @@ intpl_luma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, } #undef TMP_STRIDE +#undef XAVS2_CLIP1 +} + +static void +intpl_luma10_block_ext_c(xavs2_t *h, pel10_t *dst, int 
i_dst, pel10_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) +#define TMP_STRIDE 64 + + const int shift1 = h->param->input_sample_bit_depth - 8; + const int add1 = (1 << shift1) >> 1; + const int shift2 = 20 - h->param->input_sample_bit_depth; + const int add2 = 1 << (shift2 - 1);//1<<(19-h->input_sample_bit_depth) + + ALIGN16(mct_t tmp_buf[(64 + 7) * TMP_STRIDE]); + mct_t *tmp = tmp_buf; + int x, y, v; + + src -= 3 * i_src; + for (y = -3; y < height + 4; y++) { + for (x = 0; x < width; x++) { + v = FLT_8TAP_HOR(src, x, coeff_h); + tmp[x] = (mct_t)((v + add1) >> shift1); + } + src += i_src; + tmp += TMP_STRIDE; + } + + tmp = tmp_buf + 3 * TMP_STRIDE; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + v = (FLT_8TAP_VER(tmp, x, TMP_STRIDE, coeff_v) + add2) >> shift2; + dst[x] = (pel10_t)XAVS2_CLIP1(v); + } + + dst += i_dst; + tmp += TMP_STRIDE; + } + +#undef TMP_STRIDE +#undef XAVS2_CLIP1 } /* --------------------------------------------------------------------------- */ static void -intpl_luma_hor_c(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff) +intpl_luma8_hor_c(xavs2_t *h, pel8_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel8_t *src, int i_src, int width, int height, int8_t const *coeff) { +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + int x, y, v; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = FLT_8TAP_HOR(src, x, coeff); tmp[x] = (mct_t)v; - dst[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); + dst[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6); } src += i_src; tmp += i_tmp; dst += i_dst; } +#undef XAVS2_CLIP1 +} + +static void +intpl_luma10_hor_c(xavs2_t *h, pel10_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel10_t *src, int i_src, int width, int height, int8_t const *coeff) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + + int x, y, v; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + v = FLT_8TAP_HOR(src, x, coeff); + tmp[x] = (mct_t)v; + dst[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6); + } + src += i_src; + tmp += i_tmp; + dst += i_dst; + } +#undef XAVS2_CLIP1 } /* --------------------------------------------------------------------------- */ static void -intpl_luma_ver_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) +intpl_luma8_ver_c(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const *coeff) { +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + int x, y; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { int v = FLT_8TAP_VER(src, x, i_src, coeff); v = (v + 32) >> 6; - dst[x] = (pel_t)XAVS2_CLIP1(v); + dst[x] = (pel8_t)XAVS2_CLIP1(v); } src += i_src; dst += i_dst; } +#undef XAVS2_CLIP1 +} + +static void +intpl_luma10_ver_c(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const *coeff) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + + int x, y; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + int v = FLT_8TAP_VER(src, x, i_src, coeff); + v = (v + 32) >> 6; + dst[x] = (pel10_t)XAVS2_CLIP1(v); + } + src += i_src; + dst += i_dst; + } +#undef XAVS2_CLIP1 } /* --------------------------------------------------------------------------- */ static void -intpl_luma_ver_x3_c(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff) +intpl_luma8_ver_x3_c(xavs2_t *h, pel8_t *const dst[3], int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const **coeff) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + + int x, y, v; + pel8_t *dst0 = dst[0]; + pel8_t *dst1 = dst[1]; + pel8_t *dst2 = dst[2]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + v = FLT_8TAP_VER(src, x, i_src, coeff[0]); + dst0[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6); + v = FLT_8TAP_VER(src, x, i_src, coeff[1]); + dst1[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6); + v = FLT_8TAP_VER(src, x, i_src, coeff[2]); + dst2[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6); + } + src += i_src; + dst0 += i_dst; + dst1 += i_dst; + dst2 += i_dst; + } +#undef XAVS2_CLIP1 +} + +static void +intpl_luma10_ver_x3_c(xavs2_t *h, pel10_t *const dst[3], int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const **coeff) { +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + int x, y, v; - pel_t *dst0 = dst[0]; - pel_t *dst1 = dst[1]; - pel_t *dst2 = dst[2]; + pel10_t *dst0 = dst[0]; + pel10_t *dst1 = dst[1]; + pel10_t *dst2 = dst[2]; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = FLT_8TAP_VER(src, x, i_src, coeff[0]); - dst0[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); + dst0[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6); v = FLT_8TAP_VER(src, x, i_src, coeff[1]); - dst1[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); + dst1[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6); v = FLT_8TAP_VER(src, x, i_src, coeff[2]); - dst2[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); + dst2[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6); } src += i_src; dst0 += i_dst; dst1 += i_dst; dst2 += i_dst; } +#undef XAVS2_CLIP1 } /* --------------------------------------------------------------------------- */ static void -intpl_luma_hor_x3_c(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff) +intpl_luma8_hor_x3_c(xavs2_t *h, pel8_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel8_t *src, int i_src, int width, int height, const int8_t **coeff) { +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + int x, y, v; - pel_t *dst0 = dst[0]; - pel_t *dst1 = dst[1]; - pel_t *dst2 = dst[2]; + pel8_t *dst0 = dst[0]; + pel8_t *dst1 = dst[1]; + pel8_t *dst2 = dst[2]; mct_t *tmp0 = tmp[0]; mct_t *tmp1 = tmp[1]; mct_t *tmp2 = tmp[2]; @@ -477,13 +754,13 @@ intpl_luma_hor_x3_c(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_t for(x = 0; x < width; x++) { v = FLT_8TAP_HOR(src, x, coeff[0]); tmp0[x] = (mct_t)v; - dst0[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); + dst0[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6); v = FLT_8TAP_HOR(src, x, coeff[1]); tmp1[x] = (mct_t)v; - dst1[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); + dst1[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6); v = FLT_8TAP_HOR(src, x, coeff[2]); tmp2[x] = (mct_t)v; - dst2[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); + dst2[x] = (pel8_t)XAVS2_CLIP1((v + 32) >> 6); } src += i_src; tmp0 += i_tmp; @@ -493,57 +770,157 @@ intpl_luma_hor_x3_c(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_t dst1 += i_dst; dst2 += i_dst; } +#undef XAVS2_CLIP1 +} + +static void +intpl_luma10_hor_x3_c(xavs2_t *h, pel10_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel10_t *src, int i_src, int width, int height, const int8_t **coeff) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + + int x, y, v; + pel10_t *dst0 = dst[0]; + pel10_t *dst1 = dst[1]; + pel10_t *dst2 = dst[2]; + mct_t *tmp0 = tmp[0]; + mct_t *tmp1 = tmp[1]; + mct_t *tmp2 = tmp[2]; + + for (y = 0; y < height; y++) { + for(x = 0; x < width; x++) { + v = FLT_8TAP_HOR(src, x, coeff[0]); + tmp0[x] = (mct_t)v; + dst0[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6); + v = FLT_8TAP_HOR(src, x, coeff[1]); + tmp1[x] = (mct_t)v; + dst1[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6); + v = FLT_8TAP_HOR(src, x, coeff[2]); + tmp2[x] = (mct_t)v; + dst2[x] = (pel10_t)XAVS2_CLIP1((v + 32) >> 6); + } + src += i_src; + tmp0 += i_tmp; + tmp1 += i_tmp; + tmp2 += i_tmp; + dst0 += i_dst; + dst1 += i_dst; + dst2 += i_dst; + } +#undef XAVS2_CLIP1 } /* --------------------------------------------------------------------------- */ static void -intpl_luma_ext_c(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff) +intpl_luma8_ext_c(xavs2_t *h, pel8_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff) { - const int MC_SHIFT = 20 - g_bit_depth; - const int MC_ADD = 1 << (MC_SHIFT - 1); // (1 << (19-g_bit_depth)) +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + + const int MC_SHIFT = 20 - h->param->input_sample_bit_depth; + const int MC_ADD = 1 << (MC_SHIFT - 1); // (1 << (19-h->param->input_sample_bit_depth)) int x, y; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { int v = FLT_8TAP_VER(tmp, x, i_tmp, coeff); v = (v + MC_ADD) >> MC_SHIFT; - dst[x] = (pel_t)XAVS2_CLIP1(v); + dst[x] = (pel8_t)XAVS2_CLIP1(v); } dst += i_dst; tmp += i_tmp; } +#undef XAVS2_CLIP1 } static void -intpl_luma_ext_x3_c(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff) +intpl_luma10_ext_c(xavs2_t *h, pel10_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff) { - const int MC_SHIFT = 20 - g_bit_depth; - const int MC_ADD = 1 << (MC_SHIFT - 1); // (1 << (19-g_bit_depth)) +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 0 : (a))) + + const int MC_SHIFT = 20 - h->param->input_sample_bit_depth; + const int MC_ADD = 1 << (MC_SHIFT - 1); // (1 << (19-h->param->input_sample_bit_depth)) int x, y; - pel_t *dst0 = dst[0]; - pel_t *dst1 = dst[1]; - pel_t *dst2 = dst[2]; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + int v = FLT_8TAP_VER(tmp, x, i_tmp, coeff); + v = (v + MC_ADD) >> MC_SHIFT; + dst[x] = (pel10_t)XAVS2_CLIP1(v); + } + dst += i_dst; + tmp += i_tmp; + } +#undef XAVS2_CLIP1 +} + +/* --------------------------------------------------------------------------- + */ +static void +intpl_luma8_ext_x3_c(xavs2_t *h, pel8_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + + const int MC_SHIFT = 20 - h->param->input_sample_bit_depth; + const int MC_ADD = 1 << (MC_SHIFT - 1); // (1 << (19-h->param->input_sample_bit_depth)) + int x, y; + + pel8_t *dst0 = dst[0]; + pel8_t *dst1 = dst[1]; + pel8_t *dst2 = dst[2]; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { int v; v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[0]); v = (v + MC_ADD) >> MC_SHIFT; - dst0[x] = (pel_t)XAVS2_CLIP1(v); + dst0[x] = (pel8_t)XAVS2_CLIP1(v); v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[1]); v = (v + MC_ADD) >> MC_SHIFT; - dst1[x] = (pel_t)XAVS2_CLIP1(v); + dst1[x] = (pel8_t)XAVS2_CLIP1(v); v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[2]); v = (v + MC_ADD) >> MC_SHIFT; - dst2[x] = (pel_t)XAVS2_CLIP1(v); + dst2[x] = (pel8_t)XAVS2_CLIP1(v); } dst0 += i_dst; dst1 += i_dst; dst2 += i_dst; tmp += i_tmp; } +#undef XAVS2_CLIP1 +} + +static void +intpl_luma10_ext_x3_c(xavs2_t *h, pel10_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff) +{ +#define XAVS2_CLIP1(a) ((a) > ((1 << h->param->input_sample_bit_depth) - 1) ? ((1 << h->param->input_sample_bit_depth) - 1) : ((a) < 0 ? 
0 : (a))) + + const int MC_SHIFT = 20 - h->param->input_sample_bit_depth; + const int MC_ADD = 1 << (MC_SHIFT - 1); // (1 << (19-h->param->input_sample_bit_depth)) + int x, y; + + pel10_t *dst0 = dst[0]; + pel10_t *dst1 = dst[1]; + pel10_t *dst2 = dst[2]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + int v; + v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[0]); + v = (v + MC_ADD) >> MC_SHIFT; + dst0[x] = (pel10_t)XAVS2_CLIP1(v); + v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[1]); + v = (v + MC_ADD) >> MC_SHIFT; + dst1[x] = (pel10_t)XAVS2_CLIP1(v); + v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[2]); + v = (v + MC_ADD) >> MC_SHIFT; + dst2[x] = (pel10_t)XAVS2_CLIP1(v); + } + dst0 += i_dst; + dst1 += i_dst; + dst2 += i_dst; + tmp += i_tmp; + } +#undef XAVS2_CLIP1 } /** @@ -556,7 +933,7 @@ intpl_luma_ext_x3_c(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int w * predict one component of a luma block * ref_idx - reference frame (0.. / -1:backward) */ -void mc_luma(pel_t *p_pred, int i_pred, int pix_quad_x, int pix_quad_y, +void mc_luma8(xavs2_t *h, pel8_t *p_pred, int i_pred, int pix_quad_x, int pix_quad_y, int width, int height, const xavs2_frame_t *p_ref_frm) { int x = (pix_quad_x >> 2); @@ -564,24 +941,56 @@ void mc_luma(pel_t *p_pred, int i_pred, int pix_quad_x, int pix_quad_y, int dx = pix_quad_x & 3; int dy = pix_quad_y & 3; int i_src = p_ref_frm->i_stride[0]; - pel_t *src = p_ref_frm->filtered[(dy << 2) + dx]; + pel8_t *src = p_ref_frm->filtered8[(dy << 2) + dx]; /* fetch prediction result */ #if ENABLE_FRAME_SUBPEL_INTPL if (src != NULL) { src += y * i_src + x; - g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred, i_pred, src, i_src); + g_funcs.pixf.copy_pp8[PART_INDEX(width, height)](p_pred, i_pred, src, i_src); } else { #endif - src = p_ref_frm->filtered[0] + y * i_src + x; + src = p_ref_frm->filtered8[0] + y * i_src + x; if (dx == 0 && dy == 0) { - g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred, i_pred, src, i_src); + 
g_funcs.pixf.copy_pp8[PART_INDEX(width, height)](p_pred, i_pred, src, i_src); } else if (dy == 0) { - g_funcs.intpl_luma_block_hor(p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx]); + g_funcs.intpl_luma8_block_hor(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx]); } else if (dx == 0) { - g_funcs.intpl_luma_block_ver(p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dy]); + g_funcs.intpl_luma8_block_ver(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dy]); } else { - g_funcs.intpl_luma_block_ext(p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx], INTPL_FILTERS[dy]); + g_funcs.intpl_luma8_block_ext(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx], INTPL_FILTERS[dy]); + } +#if ENABLE_FRAME_SUBPEL_INTPL + } +#endif +} + +void mc_luma10(xavs2_t *h, pel10_t *p_pred, int i_pred, int pix_quad_x, int pix_quad_y, + int width, int height, const xavs2_frame_t *p_ref_frm) +{ + int x = (pix_quad_x >> 2); + int y = (pix_quad_y >> 2); + int dx = pix_quad_x & 3; + int dy = pix_quad_y & 3; + int i_src = p_ref_frm->i_stride[0]; + pel10_t *src = p_ref_frm->filtered10[(dy << 2) + dx]; + + /* fetch prediction result */ +#if ENABLE_FRAME_SUBPEL_INTPL + if (src != NULL) { + src += y * i_src + x; + g_funcs.pixf.copy_pp10[PART_INDEX(width, height)](p_pred, i_pred, src, i_src); + } else { +#endif + src = p_ref_frm->filtered10[0] + y * i_src + x; + if (dx == 0 && dy == 0) { + g_funcs.pixf.copy_pp10[PART_INDEX(width, height)](p_pred, i_pred, src, i_src); + } else if (dy == 0) { + g_funcs.intpl_luma10_block_hor(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx]); + } else if (dx == 0) { + g_funcs.intpl_luma10_block_ver(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dy]); + } else { + g_funcs.intpl_luma10_block_ext(h, p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx], INTPL_FILTERS[dy]); } #if ENABLE_FRAME_SUBPEL_INTPL } @@ -596,10 +1005,7 @@ void interpolate_sample_rows(xavs2_t *h, 
xavs2_frame_t* frm, int start_y, int he int i_tmp = frm->i_width[IMG_Y] + 2 * XAVS2_PAD; int width = frm->i_width[IMG_Y] + 2 * PAD_OFFSET; int off_dst = start_y * stride - PAD_OFFSET; - pel_t *src = frm->planes[IMG_Y] + off_dst; // reconstructed luma plane - pel_t *p_dst[3]; const int8_t *p_coeffs[3]; - pel_t *dst; mct_t *intpl_tmp[3]; /* ------------------------------------------------------------- @@ -633,27 +1039,150 @@ void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int he /* ------------------------------------------------------------- * interpolate horizontal positions: a.b,c */ + + if (h->param->input_sample_bit_depth == 8) { + pel8_t *src = frm->planes8[IMG_Y] + off_dst; // reconstructed luma plane + pel8_t *p_dst[3]; + pel8_t *dst; + { - const int shift_h = 4; // 往上偏移4行重新插值以并行 + const int shift_h = 4; // 寰涓婂亸绉4琛岄噸鏂版彃鍊间互骞惰 intpl_tmp[0] -= shift_h * i_tmp; intpl_tmp[1] -= shift_h * i_tmp; intpl_tmp[2] -= shift_h * i_tmp; src -= shift_h * stride; if (h->use_fractional_me > 1) { - p_dst[0] = frm->filtered[INTPL_POS_A] + off_dst - shift_h * stride; // a + p_dst[0] = frm->filtered8[INTPL_POS_A] + off_dst - shift_h * stride; // a p_coeffs[0] = INTPL_FILTERS[INTPL_POS_A]; // a - p_dst[1] = frm->filtered[INTPL_POS_B] + off_dst - shift_h * stride; // b + p_dst[1] = frm->filtered8[INTPL_POS_B] + off_dst - shift_h * stride; // b p_coeffs[1] = INTPL_FILTERS[INTPL_POS_B]; // b - p_dst[2] = frm->filtered[INTPL_POS_C] + off_dst - shift_h * stride; // c + p_dst[2] = frm->filtered8[INTPL_POS_C] + off_dst - shift_h * stride; // c p_coeffs[2] = INTPL_FILTERS[INTPL_POS_C]; // c - g_funcs.intpl_luma_hor_x3(p_dst, stride, intpl_tmp, i_tmp, src, stride, width, height + 4 + shift_h, p_coeffs); + g_funcs.intpl_luma8_hor_x3(h, p_dst, stride, intpl_tmp, i_tmp, src, stride, width, height + 4 + shift_h, p_coeffs); + } else { + // b + dst = frm->filtered8[INTPL_POS_B] + off_dst - 4 * stride; + + g_funcs.intpl_luma8_hor(h, dst, stride, intpl_tmp[1], i_tmp, src, 
stride, width, height + 4 + shift_h, INTPL_FILTERS[INTPL_POS_B]); + } + src += shift_h * stride; + intpl_tmp[0] += shift_h * i_tmp; + intpl_tmp[1] += shift_h * i_tmp; + intpl_tmp[2] += shift_h * i_tmp; + } + + /* ------------------------------------------------------------- + * interpolate vertical positions: d,h,n */ + if (h->use_fractional_me > 1) { + p_dst[0] = frm->filtered8[INTPL_POS_D] + off_dst; // d + p_coeffs[0] = INTPL_FILTERS[INTPL_POS_D >> 2]; // d + + p_dst[1] = frm->filtered8[INTPL_POS_H] + off_dst; // h + p_coeffs[1] = INTPL_FILTERS[INTPL_POS_H >> 2]; // h + + p_dst[2] = frm->filtered8[INTPL_POS_N] + off_dst; // n + p_coeffs[2] = INTPL_FILTERS[INTPL_POS_N >> 2]; // n + + g_funcs.intpl_luma8_ver_x3(h, p_dst, stride, src, stride, width, height, p_coeffs); + } else { + p_dst[1] = frm->filtered8[INTPL_POS_H] + off_dst; // h + + g_funcs.intpl_luma8_ver(h, p_dst[1], stride, src, stride, width, height, INTPL_FILTERS[INTPL_POS_H >> 2]); + } + + /* ------------------------------------------------------------- + * interpolate tilt positions: [e,f,g; i,j,k; p,q,r] */ + if (h->use_fractional_me > 1) { + // --- for e,i,p --- + p_dst[0] = frm->filtered8[INTPL_POS_E] + off_dst; // e + p_coeffs[0] = INTPL_FILTERS[INTPL_POS_E >> 2]; // e + + p_dst[1] = frm->filtered8[INTPL_POS_I] + off_dst; // i + p_coeffs[1] = INTPL_FILTERS[INTPL_POS_I >> 2]; // i + + p_dst[2] = frm->filtered8[INTPL_POS_P] + off_dst; // p + p_coeffs[2] = INTPL_FILTERS[INTPL_POS_P >> 2]; // p + + g_funcs.intpl_luma8_ext_x3(h, p_dst, stride, intpl_tmp[0], i_tmp, width, height, p_coeffs); + + // --- for f,j,q --- + p_dst[0] = frm->filtered8[INTPL_POS_F] + off_dst; // f + p_coeffs[0] = INTPL_FILTERS[INTPL_POS_F >> 2]; // f + + p_dst[1] = frm->filtered8[INTPL_POS_J] + off_dst; // j + p_coeffs[1] = INTPL_FILTERS[INTPL_POS_J >> 2]; // j + + p_dst[2] = frm->filtered8[INTPL_POS_Q] + off_dst; // q + p_coeffs[2] = INTPL_FILTERS[INTPL_POS_Q >> 2]; // q + + g_funcs.intpl_luma8_ext_x3(h, p_dst, stride, 
intpl_tmp[1], i_tmp, width, height, p_coeffs); + + // --- for g,k,r --- + p_dst[0] = frm->filtered8[INTPL_POS_G] + off_dst; // g + p_coeffs[0] = INTPL_FILTERS[INTPL_POS_G >> 2]; // g + + p_dst[1] = frm->filtered8[INTPL_POS_K] + off_dst; // k + p_coeffs[1] = INTPL_FILTERS[INTPL_POS_K >> 2]; // k + + p_dst[2] = frm->filtered8[INTPL_POS_R] + off_dst; // r + p_coeffs[2] = INTPL_FILTERS[INTPL_POS_R >> 2]; // r + + g_funcs.intpl_luma8_ext_x3(h, p_dst, stride, intpl_tmp[2], i_tmp, width, height, p_coeffs); + } else { + // j + dst = frm->filtered8[INTPL_POS_J] + off_dst; + + g_funcs.intpl_luma8_ext(h, dst, stride, intpl_tmp[1], i_tmp, width, height, INTPL_FILTERS[INTPL_POS_J >> 2]); + } + + /* --------------------------------------------------------------------------- + * expand border for all 15 filtered planes */ + { + const int padh = XAVS2_PAD - PAD_OFFSET; + const int padv = XAVS2_PAD - PAD_OFFSET; + int i; + + width = frm->i_width[IMG_Y] + PAD_OFFSET * 2; + + /* loop over all 15 filtered planes */ + for (i = 1; i < 16; i++) { + pel8_t *pix = frm->filtered8[i]; + if (pix != NULL) { + pix += start_y * stride - PAD_OFFSET; + plane_expand_border8(pix, stride, width, height, padh, padv, b_start, b_end); + } + } + } + } else { + pel10_t *src = frm->planes10[IMG_Y] + off_dst; // reconstructed luma plane + pel10_t *p_dst[3]; + pel10_t *dst; + + { + const int shift_h = 4; // 寰涓婂亸绉4琛岄噸鏂版彃鍊间互骞惰 + intpl_tmp[0] -= shift_h * i_tmp; + intpl_tmp[1] -= shift_h * i_tmp; + intpl_tmp[2] -= shift_h * i_tmp; + src -= shift_h * stride; + if (h->use_fractional_me > 1) { + p_dst[0] = frm->filtered10[INTPL_POS_A] + off_dst - shift_h * stride; // a + p_coeffs[0] = INTPL_FILTERS[INTPL_POS_A]; // a + + p_dst[1] = frm->filtered10[INTPL_POS_B] + off_dst - shift_h * stride; // b + p_coeffs[1] = INTPL_FILTERS[INTPL_POS_B]; // b + + p_dst[2] = frm->filtered10[INTPL_POS_C] + off_dst - shift_h * stride; // c + p_coeffs[2] = INTPL_FILTERS[INTPL_POS_C]; + + g_funcs.intpl_luma10_hor_x3(h, p_dst, stride, 
intpl_tmp, i_tmp, src, stride, width, height + 4 + shift_h, p_coeffs); } else { // b - dst = frm->filtered[INTPL_POS_B] + off_dst - 4 * stride; - g_funcs.intpl_luma_hor(dst, stride, intpl_tmp[1], i_tmp, src, stride, width, height + 4 + shift_h, INTPL_FILTERS[INTPL_POS_B]); + dst = frm->filtered10[INTPL_POS_B] + off_dst - 4 * stride; + + g_funcs.intpl_luma10_hor(h, dst, stride, intpl_tmp[1], i_tmp, src, stride, width, height + 4 + shift_h, INTPL_FILTERS[INTPL_POS_B]); } src += shift_h * stride; intpl_tmp[0] += shift_h * i_tmp; @@ -664,63 +1193,65 @@ void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int he /* ------------------------------------------------------------- * interpolate vertical positions: d,h,n */ if (h->use_fractional_me > 1) { - p_dst[0] = frm->filtered[INTPL_POS_D] + off_dst; // d + p_dst[0] = frm->filtered10[INTPL_POS_D] + off_dst; // d p_coeffs[0] = INTPL_FILTERS[INTPL_POS_D >> 2]; // d - p_dst[1] = frm->filtered[INTPL_POS_H] + off_dst; // h + p_dst[1] = frm->filtered10[INTPL_POS_H] + off_dst; // h p_coeffs[1] = INTPL_FILTERS[INTPL_POS_H >> 2]; // h - p_dst[2] = frm->filtered[INTPL_POS_N] + off_dst; // n + p_dst[2] = frm->filtered10[INTPL_POS_N] + off_dst; // n p_coeffs[2] = INTPL_FILTERS[INTPL_POS_N >> 2]; // n - g_funcs.intpl_luma_ver_x3(p_dst, stride, src, stride, width, height, p_coeffs); + g_funcs.intpl_luma10_ver_x3(h, p_dst, stride, src, stride, width, height, p_coeffs); } else { - p_dst[1] = frm->filtered[INTPL_POS_H] + off_dst; // h - g_funcs.intpl_luma_ver(p_dst[1], stride, src, stride, width, height, INTPL_FILTERS[INTPL_POS_H >> 2]); + p_dst[1] = frm->filtered10[INTPL_POS_H] + off_dst; // h + + g_funcs.intpl_luma10_ver(h, p_dst[1], stride, src, stride, width, height, INTPL_FILTERS[INTPL_POS_H >> 2]); } /* ------------------------------------------------------------- * interpolate tilt positions: [e,f,g; i,j,k; p,q,r] */ if (h->use_fractional_me > 1) { // --- for e,i,p --- - p_dst[0] = frm->filtered[INTPL_POS_E] + 
off_dst; // e + p_dst[0] = frm->filtered10[INTPL_POS_E] + off_dst; // e p_coeffs[0] = INTPL_FILTERS[INTPL_POS_E >> 2]; // e - p_dst[1] = frm->filtered[INTPL_POS_I] + off_dst; // i + p_dst[1] = frm->filtered10[INTPL_POS_I] + off_dst; // i p_coeffs[1] = INTPL_FILTERS[INTPL_POS_I >> 2]; // i - p_dst[2] = frm->filtered[INTPL_POS_P] + off_dst; // p + p_dst[2] = frm->filtered10[INTPL_POS_P] + off_dst; // p p_coeffs[2] = INTPL_FILTERS[INTPL_POS_P >> 2]; // p - g_funcs.intpl_luma_ext_x3(p_dst, stride, intpl_tmp[0], i_tmp, width, height, p_coeffs); + g_funcs.intpl_luma10_ext_x3(h, p_dst, stride, intpl_tmp[0], i_tmp, width, height, p_coeffs); // --- for f,j,q --- - p_dst[0] = frm->filtered[INTPL_POS_F] + off_dst; // f + p_dst[0] = frm->filtered10[INTPL_POS_F] + off_dst; // f p_coeffs[0] = INTPL_FILTERS[INTPL_POS_F >> 2]; // f - p_dst[1] = frm->filtered[INTPL_POS_J] + off_dst; // j + p_dst[1] = frm->filtered10[INTPL_POS_J] + off_dst; // j p_coeffs[1] = INTPL_FILTERS[INTPL_POS_J >> 2]; // j - p_dst[2] = frm->filtered[INTPL_POS_Q] + off_dst; // q + p_dst[2] = frm->filtered10[INTPL_POS_Q] + off_dst; // q p_coeffs[2] = INTPL_FILTERS[INTPL_POS_Q >> 2]; // q - g_funcs.intpl_luma_ext_x3(p_dst, stride, intpl_tmp[1], i_tmp, width, height, p_coeffs); + g_funcs.intpl_luma10_ext_x3(h, p_dst, stride, intpl_tmp[1], i_tmp, width, height, p_coeffs); // --- for g,k,r --- - p_dst[0] = frm->filtered[INTPL_POS_G] + off_dst; // g + p_dst[0] = frm->filtered10[INTPL_POS_G] + off_dst; // g p_coeffs[0] = INTPL_FILTERS[INTPL_POS_G >> 2]; // g - p_dst[1] = frm->filtered[INTPL_POS_K] + off_dst; // k + p_dst[1] = frm->filtered10[INTPL_POS_K] + off_dst; // k p_coeffs[1] = INTPL_FILTERS[INTPL_POS_K >> 2]; // k - p_dst[2] = frm->filtered[INTPL_POS_R] + off_dst; // r + p_dst[2] = frm->filtered10[INTPL_POS_R] + off_dst; // r p_coeffs[2] = INTPL_FILTERS[INTPL_POS_R >> 2]; // r - g_funcs.intpl_luma_ext_x3(p_dst, stride, intpl_tmp[2], i_tmp, width, height, p_coeffs); + g_funcs.intpl_luma10_ext_x3(h, p_dst, 
stride, intpl_tmp[2], i_tmp, width, height, p_coeffs); } else { // j - dst = frm->filtered[INTPL_POS_J] + off_dst; - g_funcs.intpl_luma_ext(dst, stride, intpl_tmp[1], i_tmp, width, height, INTPL_FILTERS[INTPL_POS_J >> 2]); + dst = frm->filtered10[INTPL_POS_J] + off_dst; + + g_funcs.intpl_luma10_ext(h, dst, stride, intpl_tmp[1], i_tmp, width, height, INTPL_FILTERS[INTPL_POS_J >> 2]); } /* --------------------------------------------------------------------------- @@ -734,13 +1265,14 @@ void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int he /* loop over all 15 filtered planes */ for (i = 1; i < 16; i++) { - pel_t *pix = frm->filtered[i]; + pel10_t *pix = frm->filtered10[i]; if (pix != NULL) { pix += start_y * stride - PAD_OFFSET; - plane_expand_border(pix, stride, width, height, padh, padv, b_start, b_end); + plane_expand_border10(pix, stride, width, height, padh, padv, b_start, b_end); } } } + } } /* --------------------------------------------------------------------------- @@ -754,7 +1286,7 @@ void interpolate_lcu_row(xavs2_t *h, xavs2_frame_t* frm, int i_lcu_y) int height; slice_t *slice = h->slices[h->i_slice_index]; - /* 有效插值像素区域的起始和结束行号 */ + /* 鏈夋晥鎻掑煎儚绱犲尯鍩熺殑璧峰鍜岀粨鏉熻鍙 */ if (b_start) { y_start -= PAD_OFFSET; } else { @@ -766,14 +1298,14 @@ void interpolate_lcu_row(xavs2_t *h, xavs2_frame_t* frm, int i_lcu_y) y_end -= MC_OFFSET; } - /* 多slice时减少冗余运算 */ + /* 澶歴lice鏃跺噺灏戝啑浣欒繍绠 */ if (h->param->slice_num > 1 && !b_start && !b_end) { if (slice->i_first_lcu_y == i_lcu_y) { - /* Slice的上边界 */ + /* Slice鐨勪笂杈圭晫 */ y_start += (MC_OFFSET + PAD_OFFSET); } if (slice->i_last_lcu_y == i_lcu_y) { - /* Slice的下边界 */ + /* Slice鐨勪笅杈圭晫 */ y_end += PAD_OFFSET; } } @@ -794,15 +1326,15 @@ void interpolate_lcu_row(xavs2_t *h, xavs2_frame_t* frm, int i_lcu_y) /* --------------------------------------------------------------------------- * predict one component of a chroma block */ -void mc_chroma(pel_t *p_pred_u, pel_t *p_pred_v, int i_pred, +void 
mc_chroma8(xavs2_t *h, pel8_t *p_pred_u, pel8_t *p_pred_v, int i_pred, int pix_quad_x, int pix_quad_y, int width, int height, const xavs2_frame_t *p_ref_frm) { int posx = pix_quad_x & 7; int posy = pix_quad_y & 7; int i_src = p_ref_frm->i_stride[IMG_U]; - pel_t *p_src_u = p_ref_frm->planes[IMG_U]; - pel_t *p_src_v = p_ref_frm->planes[IMG_V]; + pel8_t *p_src_u = p_ref_frm->planes8[IMG_U]; + pel8_t *p_src_v = p_ref_frm->planes8[IMG_V]; int src_offset = (pix_quad_y >> 3) * i_src + (pix_quad_x >> 3); p_src_u += src_offset; @@ -810,21 +1342,55 @@ void mc_chroma(pel_t *p_pred_u, pel_t *p_pred_v, int i_pred, if (posy == 0 && posx == 0) { if (width != 2 && width != 6 && height != 2 && height != 6) { - g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred_u, i_pred, p_src_u, i_src); - g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred_v, i_pred, p_src_v, i_src); + g_funcs.pixf.copy_pp8[PART_INDEX(width, height)](p_pred_u, i_pred, p_src_u, i_src); + g_funcs.pixf.copy_pp8[PART_INDEX(width, height)](p_pred_v, i_pred, p_src_v, i_src); } else { - g_funcs.align_copy(p_pred_u, i_pred, p_src_u, i_src, width, height); - g_funcs.align_copy(p_pred_v, i_pred, p_src_v, i_src, width, height); + g_funcs.align_copy8(h, p_pred_u, i_pred, p_src_u, i_src, width, height); + g_funcs.align_copy8(h, p_pred_v, i_pred, p_src_v, i_src, width, height); } } else if (posy == 0) { - g_funcs.intpl_chroma_block_hor(p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx]); - g_funcs.intpl_chroma_block_hor(p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx]); + g_funcs.intpl_chroma8_block_hor(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx]); + g_funcs.intpl_chroma8_block_hor(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx]); } else if (posx == 0) { - g_funcs.intpl_chroma_block_ver(p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posy]); - g_funcs.intpl_chroma_block_ver(p_pred_v, i_pred, p_src_v, i_src, 
width, height, INTPL_FILTERS_C[posy]); + g_funcs.intpl_chroma8_block_ver(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posy]); + g_funcs.intpl_chroma8_block_ver(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posy]); } else { - g_funcs.intpl_chroma_block_ext(p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]); - g_funcs.intpl_chroma_block_ext(p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]); + g_funcs.intpl_chroma8_block_ext(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]); + g_funcs.intpl_chroma8_block_ext(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]); + } +} + +void mc_chroma10(xavs2_t *h, pel10_t *p_pred_u, pel10_t *p_pred_v, int i_pred, + int pix_quad_x, int pix_quad_y, int width, int height, + const xavs2_frame_t *p_ref_frm) +{ + int posx = pix_quad_x & 7; + int posy = pix_quad_y & 7; + int i_src = p_ref_frm->i_stride[IMG_U]; + pel10_t *p_src_u = p_ref_frm->planes10[IMG_U]; + pel10_t *p_src_v = p_ref_frm->planes10[IMG_V]; + int src_offset = (pix_quad_y >> 3) * i_src + (pix_quad_x >> 3); + + p_src_u += src_offset; + p_src_v += src_offset; + + if (posy == 0 && posx == 0) { + if (width != 2 && width != 6 && height != 2 && height != 6) { + g_funcs.pixf.copy_pp10[PART_INDEX(width, height)](p_pred_u, i_pred, p_src_u, i_src); + g_funcs.pixf.copy_pp10[PART_INDEX(width, height)](p_pred_v, i_pred, p_src_v, i_src); + } else { + g_funcs.align_copy10(h, p_pred_u, i_pred, p_src_u, i_src, width, height); + g_funcs.align_copy10(h, p_pred_v, i_pred, p_src_v, i_src, width, height); + } + } else if (posy == 0) { + g_funcs.intpl_chroma10_block_hor(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx]); + g_funcs.intpl_chroma10_block_hor(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx]); + } else if (posx == 
0) { + g_funcs.intpl_chroma10_block_ver(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posy]); + g_funcs.intpl_chroma10_block_ver(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posy]); + } else { + g_funcs.intpl_chroma10_block_ext(h, p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]); + g_funcs.intpl_chroma10_block_ext(h, p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]); } } @@ -838,12 +1404,32 @@ void mc_chroma(pel_t *p_pred_u, pel_t *p_pred_v, int i_pred, /* --------------------------------------------------------------------------- */ -static void lowres_filter_core_c(pel_t *src, int i_src, pel_t *dst, int i_dst, int width, int height) +static void lowres_filter_core8_c(xavs2_t *h, pel8_t *src, int i_src, pel8_t *dst, int i_dst, int width, int height) +{ +#define FILTER(a,b,c,d) ((((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1) + + int i_src2 = i_src << 1; // stride of 2 src lines + int x, y; + pel8_t *dwn; + + for (y = 0; y < height; y++) { + dwn = src + i_src; // point to down line of src + for (x = 0; x < width; x++) { + dst[x] = FILTER(src[2 * x], dwn[2 * x], src[2 * x + 1], dwn[2 * x + 1]); + } + src += i_src2; + dst += i_dst; + } +#undef FILTER +} + +static void lowres_filter_core10_c(xavs2_t *h, pel10_t *src, int i_src, pel10_t *dst, int i_dst, int width, int height) { #define FILTER(a,b,c,d) ((((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1) + int i_src2 = i_src << 1; // stride of 2 src lines int x, y; - pel_t *dwn; + pel10_t *dwn; for (y = 0; y < height; y++) { dwn = src + i_src; // point to down line of src @@ -865,7 +1451,7 @@ static void lowres_filter_core_c(pel_t *src, int i_src, pel_t *dst, int i_dst, i /* --------------------------------------------------------------------------- * global function set initial */ -void xavs2_mem_oper_init(uint32_t cpuid, intrinsic_func_t *pf) +void xavs2_mem_oper_init(xavs2_param_t* param, uint32_t 
cpuid, intrinsic_func_t *pf) { pf->fast_memcpy = memcpy; pf->memcpy_aligned = memcpy; @@ -874,10 +1460,14 @@ void xavs2_mem_oper_init(uint32_t cpuid, intrinsic_func_t *pf) pf->memzero_aligned = memzero_aligned_c; pf->mem_repeat_i = mem_repeat_i_c; pf->mem_repeat_p = memset; - pf->lowres_filter = lowres_filter_core_c; + if (param->input_sample_bit_depth == 8) { + pf->lowres_filter8 = lowres_filter_core8_c; + } else { + pf->lowres_filter10 = lowres_filter_core10_c; + } #if ARCH_X86_64 - pf->mem_repeat_i = mem_repeat_8i_c; // x64架构下,减少循环次数同时使用64位打包赋值 + pf->mem_repeat_i = mem_repeat_8i_c; // x64鏋舵瀯涓嬶紝鍑忓皯寰幆娆℃暟鍚屾椂浣跨敤64浣嶆墦鍖呰祴鍊 #endif #if HAVE_MMX @@ -901,18 +1491,20 @@ void xavs2_mem_oper_init(uint32_t cpuid, intrinsic_func_t *pf) pf->memzero_aligned = xavs2_memzero_aligned_c_sse2; // pf->memcpy_aligned = xavs2_memcpy_aligned_c_sse2; pf->lowres_filter = xavs2_lowres_filter_core_sse2; - // pf->mem_repeat_i = xavs2_mem_repeat_i_c_sse2; // TODO: 比C版本慢,禁用 + // pf->mem_repeat_i = xavs2_mem_repeat_i_c_sse2; // TODO: 姣擟鐗堟湰鎱紝绂佺敤 } if (cpuid & XAVS2_CPU_SSSE3) { pf->lowres_filter = xavs2_lowres_filter_core_ssse3; } +#if defined(__AVX2__) if (cpuid & XAVS2_CPU_AVX2) { pf->memzero_aligned = xavs2_memzero_aligned_c_avx; - // pf->mem_repeat_i = xavs2_mem_repeat_i_c_avx; // TODO: 比C版本慢,禁用 + // pf->mem_repeat_i = xavs2_mem_repeat_i_c_avx; // TODO: 姣擟鐗堟湰鎱紝绂佺敤 pf->lowres_filter = xavs2_lowres_filter_core_avx; } +#endif #else UNUSED_PARAMETER(cpuid); #endif @@ -920,30 +1512,55 @@ void xavs2_mem_oper_init(uint32_t cpuid, intrinsic_func_t *pf) /* --------------------------------------------------------------------------- */ -void xavs2_mc_init(uint32_t cpuid, intrinsic_func_t *pf) +void xavs2_mc_init(xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf) { + if (param->input_sample_bit_depth == 8) { /* align copy */ - pf->align_copy = mc_copy_c; + pf->align_copy8 = mc_copy8_c; /* plane copy */ - pf->plane_copy = plane_copy_c; - pf->plane_copy_deinterleave = plane_copy_deinterleave_c; 
+ pf->plane_copy8 = plane_copy8_c; + pf->plane_copy8_deinterleave = plane_copy8_deinterleave_c; /* interpolate */ - pf->intpl_luma_hor = intpl_luma_hor_c; - pf->intpl_luma_ver = intpl_luma_ver_c; - pf->intpl_luma_ext = intpl_luma_ext_c; + pf->intpl_luma8_hor = intpl_luma8_hor_c; + pf->intpl_luma8_ver = intpl_luma8_ver_c; + pf->intpl_luma8_ext = intpl_luma8_ext_c; + + pf->intpl_luma8_ver_x3 = intpl_luma8_ver_x3_c; + pf->intpl_luma8_hor_x3 = intpl_luma8_hor_x3_c; + pf->intpl_luma8_ext_x3 = intpl_luma8_ext_x3_c; + + pf->intpl_luma8_block_hor = intpl_luma8_block_hor_c; + pf->intpl_luma8_block_ver = intpl_luma8_block_ver_c; + pf->intpl_luma8_block_ext = intpl_luma8_block_ext_c; + pf->intpl_chroma8_block_hor = intpl_chroma8_block_hor_c; + pf->intpl_chroma8_block_ver = intpl_chroma8_block_ver_c; + pf->intpl_chroma8_block_ext = intpl_chroma8_block_ext_c; + } else { + /* align copy */ + pf->align_copy10 = mc_copy10_c; - pf->intpl_luma_ver_x3 = intpl_luma_ver_x3_c; - pf->intpl_luma_hor_x3 = intpl_luma_hor_x3_c; - pf->intpl_luma_ext_x3 = intpl_luma_ext_x3_c; + /* plane copy */ + pf->plane_copy10 = plane_copy10_c; + pf->plane_copy10_deinterleave = plane_copy10_deinterleave_c; - pf->intpl_luma_block_hor = intpl_luma_block_hor_c; - pf->intpl_luma_block_ver = intpl_luma_block_ver_c; - pf->intpl_luma_block_ext = intpl_luma_block_ext_c; - pf->intpl_chroma_block_hor = intpl_chroma_block_hor_c; - pf->intpl_chroma_block_ver = intpl_chroma_block_ver_c; - pf->intpl_chroma_block_ext = intpl_chroma_block_ext_c; + /* interpolate */ + pf->intpl_luma10_hor = intpl_luma10_hor_c; + pf->intpl_luma10_ver = intpl_luma10_ver_c; + pf->intpl_luma10_ext = intpl_luma10_ext_c; + + pf->intpl_luma10_ver_x3 = intpl_luma10_ver_x3_c; + pf->intpl_luma10_hor_x3 = intpl_luma10_hor_x3_c; + pf->intpl_luma10_ext_x3 = intpl_luma10_ext_x3_c; + + pf->intpl_luma10_block_hor = intpl_luma10_block_hor_c; + pf->intpl_luma10_block_ver = intpl_luma10_block_ver_c; + pf->intpl_luma10_block_ext = intpl_luma10_block_ext_c; + 
pf->intpl_chroma10_block_hor = intpl_chroma10_block_hor_c; + pf->intpl_chroma10_block_ver = intpl_chroma10_block_ver_c; + pf->intpl_chroma10_block_ext = intpl_chroma10_block_ext_c; + } #if HAVE_MMX if (cpuid & XAVS2_CPU_MMX2) { @@ -951,6 +1568,7 @@ void xavs2_mc_init(uint32_t cpuid, intrinsic_func_t *pf) pf->plane_copy_deinterleave = xavs2_plane_copy_deinterleave_mmx; } +#if !HIGH_BIT_DEPTH if (cpuid & XAVS2_CPU_SSE42) { pf->intpl_luma_hor = intpl_luma_hor_sse128; pf->intpl_luma_ver = intpl_luma_ver_sse128; @@ -967,7 +1585,9 @@ void xavs2_mc_init(uint32_t cpuid, intrinsic_func_t *pf) pf->intpl_chroma_block_ver = intpl_chroma_block_ver_sse128; pf->intpl_chroma_block_ext = intpl_chroma_block_ext_sse128; } +#endif +#if defined(__AVX2__) if (cpuid & XAVS2_CPU_AVX2) { pf->intpl_luma_hor = intpl_luma_hor_avx2; pf->intpl_luma_ver = intpl_luma_ver_avx2; @@ -985,6 +1605,7 @@ void xavs2_mc_init(uint32_t cpuid, intrinsic_func_t *pf) pf->intpl_chroma_block_hor = intpl_chroma_block_hor_avx2; pf->intpl_chroma_block_ext = intpl_chroma_block_ext_avx2; } +#endif #else UNUSED_PARAMETER(cpuid); #endif diff --git a/source/common/mc.h b/source/common/mc.h index 6df7db4..e635464 100644 --- a/source/common/mc.h +++ b/source/common/mc.h @@ -44,16 +44,16 @@ */ /* --------------------------------------------------------------------------- - * img_size: 整像素精度的图像 宽度或高度 (整像素精度) - * blk_size: 当前预测块的 宽度或高度 (整像素精度) - * blk_pos: 当前块在图像中的 x/y 坐标 (整像素精度) - * mv : MV 的 x/y 分量 (1/4像素精度) + * img_size: 鏁村儚绱犵簿搴︾殑鍥惧儚 瀹藉害鎴栭珮搴 锛堟暣鍍忕礌绮惧害锛 + * blk_size: 褰撳墠棰勬祴鍧楃殑 瀹藉害鎴栭珮搴 锛堟暣鍍忕礌绮惧害锛 + * blk_pos: 褰撳墠鍧楀湪鍥惧儚涓殑 x/y 鍧愭爣 锛堟暣鍍忕礌绮惧害锛 + * mv : MV 鐨 x/y 鍒嗛噺 锛1/4鍍忕礌绮惧害锛 */ static INLINE int cu_get_mc_pos(int img_size, int blk_size, int blk_pos, int mv) { - int imv = mv >> 2; // MV的整像素精度 - int fmv = mv & 7; // MV的分像素精度部分,保留到 1/8 精度 + int imv = mv >> 2; // MV鐨勬暣鍍忕礌绮惧害 + int fmv = mv & 7; // MV鐨勫垎鍍忕礌绮惧害閮ㄥ垎锛屼繚鐣欏埌 1/8 绮惧害 if (blk_pos + imv < -blk_size - 8) { return ((-blk_size - 8) << 2) + (fmv); @@ -69,7 +69,7 @@ int 
cu_get_mc_pos(int img_size, int blk_size, int blk_pos, int mv) static ALWAYS_INLINE void get_mv_for_mc(xavs2_t *h, mv_t *mv, int pic_pix_x, int pic_pix_y, int blk_w, int blk_h) { - // WARNING: 在图像分辨率为 4K 及以下时,精度足够;8K 时不够用 + // WARNING: 鍦ㄥ浘鍍忓垎杈ㄧ巼涓 4K 鍙婁互涓嬫椂锛岀簿搴﹁冻澶燂紱8K 鏃朵笉澶熺敤 mv->x = (int16_t)cu_get_mc_pos(h->i_width, blk_w, pic_pix_x, mv->x); mv->y = (int16_t)cu_get_mc_pos(h->i_height, blk_h, pic_pix_y, mv->y); } @@ -85,13 +85,21 @@ void interpolate_lcu_row(xavs2_t *h, xavs2_frame_t* frm, int i_lcu_y); #define interpolate_sample_rows FPFX(interpolate_sample_rows) void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int height, int b_start, int b_end); -#define mc_luma FPFX(mc_luma) -void mc_luma (pel_t *p_pred, int i_pred, +#define mc_luma8 FPFX(mc_luma8) +void mc_luma8 (xavs2_t *h, pel8_t *p_pred, int i_pred, + int pic_pix_x, int pic_pix_y, int width, int height, + const xavs2_frame_t *p_ref_frm); +#define mc_luma10 FPFX(mc_luma10) +void mc_luma10 (xavs2_t *h, pel10_t *p_pred, int i_pred, int pic_pix_x, int pic_pix_y, int width, int height, const xavs2_frame_t *p_ref_frm); -#define mc_chroma FPFX(mc_chroma) -void mc_chroma(pel_t *p_pred_u, pel_t *p_pred_v, int i_pred, +#define mc_chroma8 FPFX(mc_chroma8) +void mc_chroma8(xavs2_t *h, pel8_t *p_pred_u, pel8_t *p_pred_v, int i_pred, + int pix_quad_x, int pix_quad_y, int width, int height, + const xavs2_frame_t *p_ref_frm); +#define mc_chroma10 FPFX(mc_chroma10) +void mc_chroma10(xavs2_t *h, pel10_t *p_pred_u, pel10_t *p_pred_v, int i_pred, int pix_quad_x, int pix_quad_y, int width, int height, const xavs2_frame_t *p_ref_frm); diff --git a/source/common/osdep.h b/source/common/osdep.h index 51a90f7..27ad05b 100644 --- a/source/common/osdep.h +++ b/source/common/osdep.h @@ -169,7 +169,7 @@ # define ALIGN_256_PTR(p) (p) = (uint8_t *)((intptr_t)((p) + (CACHE_LINE_256B - 1)) & (~(intptr_t)(CACHE_LINE_256B - 1))) #if defined(_MSC_VER) -#pragma warning(disable:4324) /* disable warning C4324: 由于 
__declspec(align()),结构被填充 */ +#pragma warning(disable:4324) /* disable warning C4324: 鐢变簬 __declspec(align())锛岀粨鏋勮濉厖 */ #define DECLARE_ALIGNED(var, n) __declspec(align(n)) var #else #define DECLARE_ALIGNED(var, n) var __attribute__((aligned(n))) @@ -216,7 +216,7 @@ #define ALIGNED_ARRAY_64(...) EXPAND(ALIGNED_ARRAY_EMU(63, __VA_ARGS__)) /* For AVX2 */ -#if ARCH_X86 || ARCH_X86_64 +#if defined(__AVX2__) && (ARCH_X86 || ARCH_X86_64) #define NATIVE_ALIGN 32 #define ALIGNED_N ALIGN32 #define ALIGNED_ARRAY_N ALIGNED_ARRAY_32 diff --git a/source/common/pixel.c b/source/common/pixel.c index cf6961d..e64ed04 100644 --- a/source/common/pixel.c +++ b/source/common/pixel.c @@ -85,13 +85,14 @@ const uint8_t g_partition_map_tab[] = { * =========================================================================== */ +//#if !HIGH_BIT_DEPTH /** * --------------------------------------------------------------------------- * SAD * --------------------------------------------------------------------------- */ -#define PIXEL_SAD_C(w, h) \ -static cmp_dist_t xavs2_pixel_sad_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ +#define PIXEL_SAD8_C(w, h) \ +static cmp_dist_t xavs2_pixel_sad8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\ {\ cmp_dist_t sum = 0;\ int x, y;\ @@ -108,31 +109,75 @@ static cmp_dist_t xavs2_pixel_sad_##w##x##h(const pel_t *pix1, intptr_t i_pix1, return sum;\ } -PIXEL_SAD_C(64, 64) /* 64x64 */ -PIXEL_SAD_C(64, 32) -PIXEL_SAD_C(32, 64) -PIXEL_SAD_C(64, 16) -PIXEL_SAD_C(64, 48) -PIXEL_SAD_C(16, 64) -PIXEL_SAD_C(48, 64) -PIXEL_SAD_C(32, 32) /* 32x32 */ -PIXEL_SAD_C(32, 16) -PIXEL_SAD_C(16, 32) -PIXEL_SAD_C(32, 8) -PIXEL_SAD_C(32, 24) -PIXEL_SAD_C( 8, 32) -PIXEL_SAD_C(24, 32) -PIXEL_SAD_C(16, 16) /* 16x16 */ -PIXEL_SAD_C(16, 8) -PIXEL_SAD_C( 8, 16) -PIXEL_SAD_C(16, 4) -PIXEL_SAD_C(16, 12) -PIXEL_SAD_C( 4, 16) -PIXEL_SAD_C(12, 16) -PIXEL_SAD_C( 8, 8) /* 8x8 */ -PIXEL_SAD_C( 8, 4) -PIXEL_SAD_C( 4, 
8) -PIXEL_SAD_C( 4, 4) /* 4x4 */ +PIXEL_SAD8_C(64, 64) /* 64x64 */ +PIXEL_SAD8_C(64, 32) +PIXEL_SAD8_C(32, 64) +PIXEL_SAD8_C(64, 16) +PIXEL_SAD8_C(64, 48) +PIXEL_SAD8_C(16, 64) +PIXEL_SAD8_C(48, 64) +PIXEL_SAD8_C(32, 32) /* 32x32 */ +PIXEL_SAD8_C(32, 16) +PIXEL_SAD8_C(16, 32) +PIXEL_SAD8_C(32, 8) +PIXEL_SAD8_C(32, 24) +PIXEL_SAD8_C( 8, 32) +PIXEL_SAD8_C(24, 32) +PIXEL_SAD8_C(16, 16) /* 16x16 */ +PIXEL_SAD8_C(16, 8) +PIXEL_SAD8_C( 8, 16) +PIXEL_SAD8_C(16, 4) +PIXEL_SAD8_C(16, 12) +PIXEL_SAD8_C( 4, 16) +PIXEL_SAD8_C(12, 16) +PIXEL_SAD8_C( 8, 8) /* 8x8 */ +PIXEL_SAD8_C( 8, 4) +PIXEL_SAD8_C( 4, 8) +PIXEL_SAD8_C( 4, 4) /* 4x4 */ + +#define PIXEL_SAD10_C(w, h) \ +static cmp_dist_t xavs2_pixel_sad10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\ +{\ + cmp_dist_t sum = 0;\ + int x, y;\ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w; x += 4) {\ + sum += abs(pix1[x] - pix2[x]);\ + sum += abs(pix1[x + 1] - pix2[x + 1]);\ + sum += abs(pix1[x + 2] - pix2[x + 2]);\ + sum += abs(pix1[x + 3] - pix2[x + 3]);\ + }\ + pix1 += i_pix1;\ + pix2 += i_pix2;\ + }\ + return sum;\ +} + +PIXEL_SAD10_C(64, 64) /* 64x64 */ +PIXEL_SAD10_C(64, 32) +PIXEL_SAD10_C(32, 64) +PIXEL_SAD10_C(64, 16) +PIXEL_SAD10_C(64, 48) +PIXEL_SAD10_C(16, 64) +PIXEL_SAD10_C(48, 64) +PIXEL_SAD10_C(32, 32) /* 32x32 */ +PIXEL_SAD10_C(32, 16) +PIXEL_SAD10_C(16, 32) +PIXEL_SAD10_C(32, 8) +PIXEL_SAD10_C(32, 24) +PIXEL_SAD10_C( 8, 32) +PIXEL_SAD10_C(24, 32) +PIXEL_SAD10_C(16, 16) /* 16x16 */ +PIXEL_SAD10_C(16, 8) +PIXEL_SAD10_C( 8, 16) +PIXEL_SAD10_C(16, 4) +PIXEL_SAD10_C(16, 12) +PIXEL_SAD10_C( 4, 16) +PIXEL_SAD10_C(12, 16) +PIXEL_SAD10_C( 8, 8) /* 8x8 */ +PIXEL_SAD10_C( 8, 4) +PIXEL_SAD10_C( 4, 8) +PIXEL_SAD10_C( 4, 4) /* 4x4 */ /** @@ -140,8 +185,8 @@ PIXEL_SAD_C( 4, 4) /* 4x4 */ * SAD x3 * --------------------------------------------------------------------------- */ -#define PIXEL_SAD_X3_C(w, h) \ -void xavs2_pixel_sad_x3_##w##x##h(const pel_t* pix1, const pel_t* pix2, const 
pel_t* pix3, const pel_t* pix4, intptr_t i_fref_stride, int32_t* res)\ +#define PIXEL_SAD8_X3_C(w, h) \ +void xavs2_pixel_sad8_x3_##w##x##h(const pel8_t* pix1, const pel8_t* pix2, const pel8_t* pix3, const pel8_t* pix4, intptr_t i_fref_stride, int32_t* res)\ {\ int x, y;\ res[0] = 0;\ @@ -160,31 +205,77 @@ void xavs2_pixel_sad_x3_##w##x##h(const pel_t* pix1, const pel_t* pix2, const pe }\ } -PIXEL_SAD_X3_C(64, 64) /* 64x64 */ -PIXEL_SAD_X3_C(64, 32) -PIXEL_SAD_X3_C(32, 64) -PIXEL_SAD_X3_C(64, 16) -PIXEL_SAD_X3_C(64, 48) -PIXEL_SAD_X3_C(16, 64) -PIXEL_SAD_X3_C(48, 64) -PIXEL_SAD_X3_C(32, 32) /* 32x32 */ -PIXEL_SAD_X3_C(32, 16) -PIXEL_SAD_X3_C(16, 32) -PIXEL_SAD_X3_C(32, 8) -PIXEL_SAD_X3_C(32, 24) -PIXEL_SAD_X3_C( 8, 32) -PIXEL_SAD_X3_C(24, 32) -PIXEL_SAD_X3_C(16, 16) /* 16x16 */ -PIXEL_SAD_X3_C(16, 8) -PIXEL_SAD_X3_C( 8, 16) -PIXEL_SAD_X3_C(16, 4) -PIXEL_SAD_X3_C(16, 12) -PIXEL_SAD_X3_C( 4, 16) -PIXEL_SAD_X3_C(12, 16) -PIXEL_SAD_X3_C( 8, 8) /* 8x8 */ -PIXEL_SAD_X3_C( 8, 4) -PIXEL_SAD_X3_C( 4, 8) -PIXEL_SAD_X3_C( 4, 4) /* 4x4 */ +PIXEL_SAD8_X3_C(64, 64) /* 64x64 */ +PIXEL_SAD8_X3_C(64, 32) +PIXEL_SAD8_X3_C(32, 64) +PIXEL_SAD8_X3_C(64, 16) +PIXEL_SAD8_X3_C(64, 48) +PIXEL_SAD8_X3_C(16, 64) +PIXEL_SAD8_X3_C(48, 64) +PIXEL_SAD8_X3_C(32, 32) /* 32x32 */ +PIXEL_SAD8_X3_C(32, 16) +PIXEL_SAD8_X3_C(16, 32) +PIXEL_SAD8_X3_C(32, 8) +PIXEL_SAD8_X3_C(32, 24) +PIXEL_SAD8_X3_C( 8, 32) +PIXEL_SAD8_X3_C(24, 32) +PIXEL_SAD8_X3_C(16, 16) /* 16x16 */ +PIXEL_SAD8_X3_C(16, 8) +PIXEL_SAD8_X3_C( 8, 16) +PIXEL_SAD8_X3_C(16, 4) +PIXEL_SAD8_X3_C(16, 12) +PIXEL_SAD8_X3_C( 4, 16) +PIXEL_SAD8_X3_C(12, 16) +PIXEL_SAD8_X3_C( 8, 8) /* 8x8 */ +PIXEL_SAD8_X3_C( 8, 4) +PIXEL_SAD8_X3_C( 4, 8) +PIXEL_SAD8_X3_C( 4, 4) /* 4x4 */ + +#define PIXEL_SAD10_X3_C(w, h) \ +void xavs2_pixel_sad10_x3_##w##x##h(const pel10_t* pix1, const pel10_t* pix2, const pel10_t* pix3, const pel10_t* pix4, intptr_t i_fref_stride, int32_t* res)\ +{\ + int x, y;\ + res[0] = 0;\ + res[1] = 0;\ + res[2] = 0;\ + for (y = 0; y < h; 
y++) {\ + for (x = 0; x < w; x++) {\ + res[0] += abs(pix1[x] - pix2[x]);\ + res[1] += abs(pix1[x] - pix3[x]);\ + res[2] += abs(pix1[x] - pix4[x]);\ + }\ + pix1 += FENC_STRIDE;\ + pix2 += i_fref_stride;\ + pix3 += i_fref_stride;\ + pix4 += i_fref_stride;\ + }\ +} + +PIXEL_SAD10_X3_C(64, 64) /* 64x64 */ +PIXEL_SAD10_X3_C(64, 32) +PIXEL_SAD10_X3_C(32, 64) +PIXEL_SAD10_X3_C(64, 16) +PIXEL_SAD10_X3_C(64, 48) +PIXEL_SAD10_X3_C(16, 64) +PIXEL_SAD10_X3_C(48, 64) +PIXEL_SAD10_X3_C(32, 32) /* 32x32 */ +PIXEL_SAD10_X3_C(32, 16) +PIXEL_SAD10_X3_C(16, 32) +PIXEL_SAD10_X3_C(32, 8) +PIXEL_SAD10_X3_C(32, 24) +PIXEL_SAD10_X3_C( 8, 32) +PIXEL_SAD10_X3_C(24, 32) +PIXEL_SAD10_X3_C(16, 16) /* 16x16 */ +PIXEL_SAD10_X3_C(16, 8) +PIXEL_SAD10_X3_C( 8, 16) +PIXEL_SAD10_X3_C(16, 4) +PIXEL_SAD10_X3_C(16, 12) +PIXEL_SAD10_X3_C( 4, 16) +PIXEL_SAD10_X3_C(12, 16) +PIXEL_SAD10_X3_C( 8, 8) /* 8x8 */ +PIXEL_SAD10_X3_C( 8, 4) +PIXEL_SAD10_X3_C( 4, 8) +PIXEL_SAD10_X3_C( 4, 4) /* 4x4 */ /** @@ -193,8 +284,57 @@ PIXEL_SAD_X3_C( 4, 4) /* 4x4 */ * --------------------------------------------------------------------------- */ -#define PIXEL_SAD_X4_C(w, h) \ -void xavs2_pixel_sad_x4_##w##x##h(const pel_t* pix1, const pel_t* pix2, const pel_t* pix3, const pel_t* pix4, const pel_t* pix5, intptr_t i_fref_stride, int32_t* res)\ +#define PIXEL_SAD8_X4_C(w, h) \ +void xavs2_pixel_sad8_x4_##w##x##h(const pel8_t* pix1, const pel8_t* pix2, const pel8_t* pix3, const pel8_t* pix4, const pel8_t* pix5, intptr_t i_fref_stride, int32_t* res)\ +{\ + int x, y;\ + res[0] = 0;\ + res[1] = 0;\ + res[2] = 0;\ + res[3] = 0;\ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w; x++) {\ + res[0] += abs(pix1[x] - pix2[x]);\ + res[1] += abs(pix1[x] - pix3[x]);\ + res[2] += abs(pix1[x] - pix4[x]);\ + res[3] += abs(pix1[x] - pix5[x]);\ + }\ + pix1 += FENC_STRIDE;\ + pix2 += i_fref_stride;\ + pix3 += i_fref_stride;\ + pix4 += i_fref_stride;\ + pix5 += i_fref_stride;\ + }\ +} + +PIXEL_SAD8_X4_C(64, 64) /* 64x64 */ +PIXEL_SAD8_X4_C(64, 32) 
+PIXEL_SAD8_X4_C(32, 64) +PIXEL_SAD8_X4_C(64, 16) +PIXEL_SAD8_X4_C(64, 48) +PIXEL_SAD8_X4_C(16, 64) +PIXEL_SAD8_X4_C(48, 64) +PIXEL_SAD8_X4_C(32, 32) /* 32x32 */ +PIXEL_SAD8_X4_C(32, 16) +PIXEL_SAD8_X4_C(16, 32) +PIXEL_SAD8_X4_C(32, 8) +PIXEL_SAD8_X4_C(32, 24) +PIXEL_SAD8_X4_C( 8, 32) +PIXEL_SAD8_X4_C(24, 32) +PIXEL_SAD8_X4_C(16, 16) /* 16x16 */ +PIXEL_SAD8_X4_C(16, 8) +PIXEL_SAD8_X4_C( 8, 16) +PIXEL_SAD8_X4_C(16, 4) +PIXEL_SAD8_X4_C(16, 12) +PIXEL_SAD8_X4_C( 4, 16) +PIXEL_SAD8_X4_C(12, 16) +PIXEL_SAD8_X4_C( 8, 8) /* 8x8 */ +PIXEL_SAD8_X4_C( 8, 4) +PIXEL_SAD8_X4_C( 4, 8) +PIXEL_SAD8_X4_C( 4, 4) /* 4x4 */ + +#define PIXEL_SAD10_X4_C(w, h) \ +void xavs2_pixel_sad10_x4_##w##x##h(const pel10_t* pix1, const pel10_t* pix2, const pel10_t* pix3, const pel10_t* pix4, const pel10_t* pix5, intptr_t i_fref_stride, int32_t* res)\ {\ int x, y;\ res[0] = 0;\ @@ -216,31 +356,32 @@ void xavs2_pixel_sad_x4_##w##x##h(const pel_t* pix1, const pel_t* pix2, const pe }\ } -PIXEL_SAD_X4_C(64, 64) /* 64x64 */ -PIXEL_SAD_X4_C(64, 32) -PIXEL_SAD_X4_C(32, 64) -PIXEL_SAD_X4_C(64, 16) -PIXEL_SAD_X4_C(64, 48) -PIXEL_SAD_X4_C(16, 64) -PIXEL_SAD_X4_C(48, 64) -PIXEL_SAD_X4_C(32, 32) /* 32x32 */ -PIXEL_SAD_X4_C(32, 16) -PIXEL_SAD_X4_C(16, 32) -PIXEL_SAD_X4_C(32, 8) -PIXEL_SAD_X4_C(32, 24) -PIXEL_SAD_X4_C( 8, 32) -PIXEL_SAD_X4_C(24, 32) -PIXEL_SAD_X4_C(16, 16) /* 16x16 */ -PIXEL_SAD_X4_C(16, 8) -PIXEL_SAD_X4_C( 8, 16) -PIXEL_SAD_X4_C(16, 4) -PIXEL_SAD_X4_C(16, 12) -PIXEL_SAD_X4_C( 4, 16) -PIXEL_SAD_X4_C(12, 16) -PIXEL_SAD_X4_C( 8, 8) /* 8x8 */ -PIXEL_SAD_X4_C( 8, 4) -PIXEL_SAD_X4_C( 4, 8) -PIXEL_SAD_X4_C( 4, 4) /* 4x4 */ +PIXEL_SAD10_X4_C(64, 64) /* 64x64 */ +PIXEL_SAD10_X4_C(64, 32) +PIXEL_SAD10_X4_C(32, 64) +PIXEL_SAD10_X4_C(64, 16) +PIXEL_SAD10_X4_C(64, 48) +PIXEL_SAD10_X4_C(16, 64) +PIXEL_SAD10_X4_C(48, 64) +PIXEL_SAD10_X4_C(32, 32) /* 32x32 */ +PIXEL_SAD10_X4_C(32, 16) +PIXEL_SAD10_X4_C(16, 32) +PIXEL_SAD10_X4_C(32, 8) +PIXEL_SAD10_X4_C(32, 24) +PIXEL_SAD10_X4_C( 8, 32) +PIXEL_SAD10_X4_C(24, 32) 
+PIXEL_SAD10_X4_C(16, 16) /* 16x16 */ +PIXEL_SAD10_X4_C(16, 8) +PIXEL_SAD10_X4_C( 8, 16) +PIXEL_SAD10_X4_C(16, 4) +PIXEL_SAD10_X4_C(16, 12) +PIXEL_SAD10_X4_C( 4, 16) +PIXEL_SAD10_X4_C(12, 16) +PIXEL_SAD10_X4_C( 8, 8) /* 8x8 */ +PIXEL_SAD10_X4_C( 8, 4) +PIXEL_SAD10_X4_C( 4, 8) +PIXEL_SAD10_X4_C( 4, 4) /* 4x4 */ +//#endif /** @@ -294,7 +435,34 @@ ALWAYS_INLINE uint64_t abs2_10bit(uint64_t a) /* --------------------------------------------------------------------------- */ -static cmp_dist_t xavs2_pixel_satd_4x4(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2) +static cmp_dist_t xavs2_pixel_satd8_4x4(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2) +{ + uint32_t tmp[4][2]; + uint32_t a0, a1, a2, a3, b0, b1; + cmp_dist_t sum = 0; + int i; + + for (i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) { + a0 = pix1[0] - pix2[0]; + a1 = pix1[1] - pix2[1]; + b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); + a2 = pix1[2] - pix2[2]; + a3 = pix1[3] - pix2[3]; + b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); + tmp[i][0] = b0 + b1; + tmp[i][1] = b0 - b1; + } + + for (i = 0; i < 2; i++) { + HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); + a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); + sum += ((uint16_t)a0) + (a0 >> BITS_PER_SUM); + } + + return (sum >> 1); +} + +static cmp_dist_t xavs2_pixel_satd10_4x4(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2) { uint32_t tmp[4][2]; uint32_t a0, a1, a2, a3, b0, b1; @@ -324,7 +492,30 @@ static cmp_dist_t xavs2_pixel_satd_4x4(const pel_t *pix1, intptr_t i_pix1, const /* --------------------------------------------------------------------------- * SWAR version of satd 8x4, performs two 4x4 SATDs at once */ -static cmp_dist_t xavs2_pixel_satd_8x4(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2) +static cmp_dist_t xavs2_pixel_satd8_8x4(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2) +{ + uint32_t 
tmp[4][4]; + uint32_t a0, a1, a2, a3; + cmp_dist_t sum = 0; + int i; + + for (i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) { + a0 = (pix1[0] - pix2[0]) + ((uint32_t)(pix1[4] - pix2[4]) << BITS_PER_SUM); + a1 = (pix1[1] - pix2[1]) + ((uint32_t)(pix1[5] - pix2[5]) << BITS_PER_SUM); + a2 = (pix1[2] - pix2[2]) + ((uint32_t)(pix1[6] - pix2[6]) << BITS_PER_SUM); + a3 = (pix1[3] - pix2[3]) + ((uint32_t)(pix1[7] - pix2[7]) << BITS_PER_SUM); + HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3); + } + + for (i = 0; i < 4; i++) { + HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); + sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); + } + + return (((uint16_t)sum) + (sum >> BITS_PER_SUM)) >> 1; +} + +static cmp_dist_t xavs2_pixel_satd10_8x4(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2) { uint32_t tmp[4][4]; uint32_t a0, a1, a2, a3; @@ -351,14 +542,28 @@ static cmp_dist_t xavs2_pixel_satd_8x4(const pel_t *pix1, intptr_t i_pix1, const /* --------------------------------------------------------------------------- * calculate satd in blocks of 4x4 */ -#define PIXEL_SATD4_C(w, h) \ -static cmp_dist_t xavs2_pixel_satd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ +#define PIXEL_SATD8_4_C(w, h) \ +static cmp_dist_t xavs2_pixel_satd8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\ +{\ + cmp_dist_t satd = 0;\ + int y, x;\ + for (y = 0; y < h; y += 4) {\ + for (x = 0; x < w; x += 4) {\ + satd += xavs2_pixel_satd8_4x4(pix1 + y * i_pix1 + x, i_pix1,\ + pix2 + y * i_pix2 + x, i_pix2);\ + }\ + }\ + return satd;\ +} + +#define PIXEL_SATD10_4_C(w, h) \ +static cmp_dist_t xavs2_pixel_satd10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\ {\ cmp_dist_t satd = 0;\ int y, x;\ for (y = 0; y < h; y += 4) {\ for (x = 0; x < w; x += 4) {\ - satd += xavs2_pixel_satd_4x4(pix1 + y * i_pix1 + x, i_pix1,\ + satd += 
xavs2_pixel_satd10_4x4(pix1 + y * i_pix1 + x, i_pix1,\ pix2 + y * i_pix2 + x, i_pix2);\ }\ }\ @@ -368,43 +573,81 @@ static cmp_dist_t xavs2_pixel_satd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, /* --------------------------------------------------------------------------- * calculate satd in blocks of 8x4 */ -#define PIXEL_SATD8_C(w, h) \ -static cmp_dist_t xavs2_pixel_satd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ +#define PIXEL_SATD8_8_C(w, h) \ +static cmp_dist_t xavs2_pixel_satd8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\ {\ cmp_dist_t satd = 0;\ int y, x;\ for (y = 0; y < h; y += 4) {\ for (x = 0; x < w; x += 8) {\ - satd += xavs2_pixel_satd_8x4(pix1 + y * i_pix1 + x, i_pix1,\ + satd += xavs2_pixel_satd8_8x4(pix1 + y * i_pix1 + x, i_pix1,\ pix2 + y * i_pix2 + x, i_pix2);\ }\ }\ return satd;\ } -PIXEL_SATD8_C(64, 64) /* 64x64 */ -PIXEL_SATD8_C(64, 32) -PIXEL_SATD8_C(32, 64) -PIXEL_SATD8_C(64, 16) -PIXEL_SATD8_C(64, 48) -PIXEL_SATD8_C(16, 64) -PIXEL_SATD8_C(48, 64) -PIXEL_SATD8_C(32, 32) /* 32x32 */ -PIXEL_SATD8_C(32, 16) -PIXEL_SATD8_C(16, 32) -PIXEL_SATD8_C(32, 8) -PIXEL_SATD8_C(32, 24) -PIXEL_SATD8_C( 8, 32) -PIXEL_SATD8_C(24, 32) -PIXEL_SATD8_C(16, 16) /* 16x16 */ -PIXEL_SATD8_C(16, 8) -PIXEL_SATD8_C( 8, 16) -PIXEL_SATD8_C(16, 4) -PIXEL_SATD8_C(16, 12) -PIXEL_SATD4_C( 4, 16) -PIXEL_SATD4_C(12, 16) -PIXEL_SATD8_C( 8, 8) /* 8x8 */ -PIXEL_SATD4_C( 4, 8) +PIXEL_SATD8_8_C(64, 64) /* 64x64 */ +PIXEL_SATD8_8_C(64, 32) +PIXEL_SATD8_8_C(32, 64) +PIXEL_SATD8_8_C(64, 16) +PIXEL_SATD8_8_C(64, 48) +PIXEL_SATD8_8_C(16, 64) +PIXEL_SATD8_8_C(48, 64) +PIXEL_SATD8_8_C(32, 32) /* 32x32 */ +PIXEL_SATD8_8_C(32, 16) +PIXEL_SATD8_8_C(16, 32) +PIXEL_SATD8_8_C(32, 8) +PIXEL_SATD8_8_C(32, 24) +PIXEL_SATD8_8_C( 8, 32) +PIXEL_SATD8_8_C(24, 32) +PIXEL_SATD8_8_C(16, 16) /* 16x16 */ +PIXEL_SATD8_8_C(16, 8) +PIXEL_SATD8_8_C( 8, 16) +PIXEL_SATD8_8_C(16, 4) +PIXEL_SATD8_8_C(16, 12) +PIXEL_SATD8_4_C( 4, 16) 
+PIXEL_SATD8_4_C(12, 16) +PIXEL_SATD8_8_C( 8, 8) /* 8x8 */ +PIXEL_SATD8_4_C( 4, 8) + +#define PIXEL_SATD10_8_C(w, h) \ +static cmp_dist_t xavs2_pixel_satd10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\ +{\ + cmp_dist_t satd = 0;\ + int y, x;\ + for (y = 0; y < h; y += 4) {\ + for (x = 0; x < w; x += 8) {\ + satd += xavs2_pixel_satd10_8x4(pix1 + y * i_pix1 + x, i_pix1,\ + pix2 + y * i_pix2 + x, i_pix2);\ + }\ + }\ + return satd;\ +} + +PIXEL_SATD10_8_C(64, 64) /* 64x64 */ +PIXEL_SATD10_8_C(64, 32) +PIXEL_SATD10_8_C(32, 64) +PIXEL_SATD10_8_C(64, 16) +PIXEL_SATD10_8_C(64, 48) +PIXEL_SATD10_8_C(16, 64) +PIXEL_SATD10_8_C(48, 64) +PIXEL_SATD10_8_C(32, 32) /* 32x32 */ +PIXEL_SATD10_8_C(32, 16) +PIXEL_SATD10_8_C(16, 32) +PIXEL_SATD10_8_C(32, 8) +PIXEL_SATD10_8_C(32, 24) +PIXEL_SATD10_8_C( 8, 32) +PIXEL_SATD10_8_C(24, 32) +PIXEL_SATD10_8_C(16, 16) /* 16x16 */ +PIXEL_SATD10_8_C(16, 8) +PIXEL_SATD10_8_C( 8, 16) +PIXEL_SATD10_8_C(16, 4) +PIXEL_SATD10_8_C(16, 12) +PIXEL_SATD10_4_C( 4, 16) +PIXEL_SATD10_4_C(12, 16) +PIXEL_SATD10_8_C( 8, 8) /* 8x8 */ +PIXEL_SATD10_4_C( 4, 8) /** @@ -413,7 +656,42 @@ PIXEL_SATD4_C( 4, 8) * --------------------------------------------------------------------------- */ -int _sa8d_8x8(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_pix2) +int _sa8d8_8x8(const pel8_t* pix1, intptr_t i_pix1, const pel8_t* pix2, intptr_t i_pix2) +{ + sum2_t tmp[8][4]; + sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; + sum2_t sum = 0; + + for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2) { + a0 = pix1[0] - pix2[0]; + a1 = pix1[1] - pix2[1]; + b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); + a2 = pix1[2] - pix2[2]; + a3 = pix1[3] - pix2[3]; + b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); + a4 = pix1[4] - pix2[4]; + a5 = pix1[5] - pix2[5]; + b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM); + a6 = pix1[6] - pix2[6]; + a7 = pix1[7] - pix2[7]; + b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM); + 
HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3); + } + + for (int i = 0; i < 4; i++) { + HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); + HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]); + b0 = abs2(a0 + a4) + abs2(a0 - a4); + b0 += abs2(a1 + a5) + abs2(a1 - a5); + b0 += abs2(a2 + a6) + abs2(a2 - a6); + b0 += abs2(a3 + a7) + abs2(a3 - a7); + sum += (sum_t)b0 + (b0 >> BITS_PER_SUM); + } + + return (cmp_dist_t)sum; +} + +int _sa8d10_8x8(const pel10_t* pix1, intptr_t i_pix1, const pel10_t* pix2, intptr_t i_pix2) { sum2_t tmp[8][4]; sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; @@ -451,20 +729,39 @@ int _sa8d_8x8(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_ /* --------------------------------------------------------------------------- */ static -cmp_dist_t xavs2_pixel_sa8d_8x8(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_pix2) +cmp_dist_t xavs2_pixel_sa8d8_8x8(const pel8_t* pix1, intptr_t i_pix1, const pel8_t* pix2, intptr_t i_pix2) { - return (cmp_dist_t)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); + return (cmp_dist_t)((_sa8d8_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); +} + +static +cmp_dist_t xavs2_pixel_sa8d10_8x8(const pel10_t* pix1, intptr_t i_pix1, const pel10_t* pix2, intptr_t i_pix2) +{ + return (cmp_dist_t)((_sa8d10_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); } /* --------------------------------------------------------------------------- */ static -cmp_dist_t xavs2_pixel_sa8d_16x16(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_pix2) +cmp_dist_t xavs2_pixel_sa8d8_16x16(const pel8_t* pix1, intptr_t i_pix1, const pel8_t* pix2, intptr_t i_pix2) +{ + cmp_dist_t sum = _sa8d8_8x8(pix1, i_pix1, pix2, i_pix2) + + _sa8d8_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2) + + _sa8d8_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2) + + _sa8d8_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2); + + // This 
matches x264 sa8d_16x16, but is slightly different from HM's behavior because + // this version only rounds once at the end + return (sum + 2) >> 2; +} + +static +cmp_dist_t xavs2_pixel_sa8d10_16x16(const pel10_t* pix1, intptr_t i_pix1, const pel10_t* pix2, intptr_t i_pix2) { - cmp_dist_t sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2) - + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2) - + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2) - + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2); + cmp_dist_t sum = _sa8d10_8x8(pix1, i_pix1, pix2, i_pix2) + + _sa8d10_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2) + + _sa8d10_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2) + + _sa8d10_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2); // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because // this version only rounds once at the end @@ -474,14 +771,28 @@ cmp_dist_t xavs2_pixel_sa8d_16x16(const pel_t* pix1, intptr_t i_pix1, const pel_ /* --------------------------------------------------------------------------- * calculate sa8d in blocks of 8x8 */ -#define PIXEL_SA8D_C8(w, h) \ -static cmp_dist_t xavs2_pixel_sa8d_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ +#define PIXEL_SA8D8_C8(w, h) \ +static cmp_dist_t xavs2_pixel_sa8d8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\ {\ cmp_dist_t sa8d = 0;\ int y, x;\ for (y = 0; y < h; y += 8) {\ for (x = 0; x < w; x += 8) {\ - sa8d += xavs2_pixel_sa8d_8x8(pix1 + y * i_pix1 + x, i_pix1,\ + sa8d += xavs2_pixel_sa8d8_8x8(pix1 + y * i_pix1 + x, i_pix1,\ + pix2 + y * i_pix2 + x, i_pix2);\ + }\ + }\ + return sa8d;\ +} + +#define PIXEL_SA8D10_C8(w, h) \ +static cmp_dist_t xavs2_pixel_sa8d10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\ +{\ + cmp_dist_t sa8d = 0;\ + int y, x;\ + for (y = 0; y < h; y += 8) {\ + for (x = 0; x < w; x += 8) {\ + 
sa8d += xavs2_pixel_sa8d10_8x8(pix1 + y * i_pix1 + x, i_pix1,\ pix2 + y * i_pix2 + x, i_pix2);\ }\ }\ @@ -491,50 +802,104 @@ static cmp_dist_t xavs2_pixel_sa8d_##w##x##h(const pel_t *pix1, intptr_t i_pix1, /* --------------------------------------------------------------------------- * calculate sa8d in blocks of 16x16 */ -#define PIXEL_SA8D_C16(w, h) \ -static cmp_dist_t xavs2_pixel_sa8d_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ +#define PIXEL_SA8D8_C16(w, h) \ +static cmp_dist_t xavs2_pixel_sa8d8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\ {\ cmp_dist_t sa8d = 0;\ int y, x;\ for (y = 0; y < h; y += 16) {\ for (x = 0; x < w; x += 16) {\ - sa8d += xavs2_pixel_sa8d_16x16(pix1 + y * i_pix1 + x, i_pix1,\ + sa8d += xavs2_pixel_sa8d8_16x16(pix1 + y * i_pix1 + x, i_pix1,\ pix2 + y * i_pix2 + x, i_pix2);\ }\ }\ return sa8d;\ } -#define xavs2_pixel_sa8d_4x4 xavs2_pixel_satd_4x4 -#define xavs2_pixel_sa8d_4x8 xavs2_pixel_satd_4x8 -#define xavs2_pixel_sa8d_8x4 xavs2_pixel_satd_8x4 -#define xavs2_pixel_sa8d_16x4 xavs2_pixel_satd_16x4 -#define xavs2_pixel_sa8d_4x16 xavs2_pixel_satd_4x16 -#define xavs2_pixel_sa8d_12x16 xavs2_pixel_satd_12x16 -#define xavs2_pixel_sa8d_16x12 xavs2_pixel_satd_16x12 -PIXEL_SA8D_C8(8, 16) -PIXEL_SA8D_C8(8, 32) -PIXEL_SA8D_C8(16, 8) -PIXEL_SA8D_C8(32, 8) -PIXEL_SA8D_C16(32, 16) -PIXEL_SA8D_C8(32, 24) -PIXEL_SA8D_C8(24, 32) -PIXEL_SA8D_C16(32, 32) -PIXEL_SA8D_C16(16, 32) -PIXEL_SA8D_C16(64, 16) -PIXEL_SA8D_C16(64, 32) -PIXEL_SA8D_C16(64, 48) -PIXEL_SA8D_C16(16, 64) -PIXEL_SA8D_C16(32, 64) -PIXEL_SA8D_C16(48, 64) -PIXEL_SA8D_C16(64, 64) +#define xavs2_pixel_sa8d8_4x4 xavs2_pixel_satd8_4x4 +#define xavs2_pixel_sa8d8_4x8 xavs2_pixel_satd8_4x8 +#define xavs2_pixel_sa8d8_8x4 xavs2_pixel_satd8_8x4 +#define xavs2_pixel_sa8d8_16x4 xavs2_pixel_satd8_16x4 +#define xavs2_pixel_sa8d8_4x16 xavs2_pixel_satd8_4x16 +#define xavs2_pixel_sa8d8_12x16 xavs2_pixel_satd8_12x16 +#define 
xavs2_pixel_sa8d8_16x12 xavs2_pixel_satd8_16x12 +PIXEL_SA8D8_C8(8, 16) +PIXEL_SA8D8_C8(8, 32) +PIXEL_SA8D8_C8(16, 8) +PIXEL_SA8D8_C8(32, 8) +PIXEL_SA8D8_C16(32, 16) +PIXEL_SA8D8_C8(32, 24) +PIXEL_SA8D8_C8(24, 32) +PIXEL_SA8D8_C16(32, 32) +PIXEL_SA8D8_C16(16, 32) +PIXEL_SA8D8_C16(64, 16) +PIXEL_SA8D8_C16(64, 32) +PIXEL_SA8D8_C16(64, 48) +PIXEL_SA8D8_C16(16, 64) +PIXEL_SA8D8_C16(32, 64) +PIXEL_SA8D8_C16(48, 64) +PIXEL_SA8D8_C16(64, 64) + +#define PIXEL_SA8D10_C16(w, h) \ +static cmp_dist_t xavs2_pixel_sa8d10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\ +{\ + cmp_dist_t sa8d = 0;\ + int y, x;\ + for (y = 0; y < h; y += 16) {\ + for (x = 0; x < w; x += 16) {\ + sa8d += xavs2_pixel_sa8d10_16x16(pix1 + y * i_pix1 + x, i_pix1,\ + pix2 + y * i_pix2 + x, i_pix2);\ + }\ + }\ + return sa8d;\ +} + +#define xavs2_pixel_sa8d10_4x4 xavs2_pixel_satd10_4x4 +#define xavs2_pixel_sa8d10_4x8 xavs2_pixel_satd10_4x8 +#define xavs2_pixel_sa8d10_8x4 xavs2_pixel_satd10_8x4 +#define xavs2_pixel_sa8d10_16x4 xavs2_pixel_satd10_16x4 +#define xavs2_pixel_sa8d10_4x16 xavs2_pixel_satd10_4x16 +#define xavs2_pixel_sa8d10_12x16 xavs2_pixel_satd10_12x16 +#define xavs2_pixel_sa8d10_16x12 xavs2_pixel_satd10_16x12 +PIXEL_SA8D10_C8(8, 16) +PIXEL_SA8D10_C8(8, 32) +PIXEL_SA8D10_C8(16, 8) +PIXEL_SA8D10_C8(32, 8) +PIXEL_SA8D10_C16(32, 16) +PIXEL_SA8D10_C8(32, 24) +PIXEL_SA8D10_C8(24, 32) +PIXEL_SA8D10_C16(32, 32) +PIXEL_SA8D10_C16(16, 32) +PIXEL_SA8D10_C16(64, 16) +PIXEL_SA8D10_C16(64, 32) +PIXEL_SA8D10_C16(64, 48) +PIXEL_SA8D10_C16(16, 64) +PIXEL_SA8D10_C16(32, 64) +PIXEL_SA8D10_C16(48, 64) +PIXEL_SA8D10_C16(64, 64) + /** * --------------------------------------------------------------------------- * SSD * --------------------------------------------------------------------------- */ -dist_t xavs2_get_block_ssd_c(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2, int width, int height) +dist_t xavs2_get_block_ssd8_c(const pel8_t *pix1, intptr_t 
i_pix1, const pel8_t *pix2, intptr_t i_pix2, int width, int height) +{ + dist_t sum = 0; + int x, y, tmp; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + tmp = pix1[x] - pix2[x]; + sum += (tmp * tmp); + } + pix1 += i_pix1; + pix2 += i_pix2; + } + return sum; +} + +dist_t xavs2_get_block_ssd10_c(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2, int width, int height) { dist_t sum = 0; int x, y, tmp; @@ -549,8 +914,50 @@ dist_t xavs2_get_block_ssd_c(const pel_t *pix1, intptr_t i_pix1, const pel_t *pi return sum; } -#define PIXEL_SSD_C(w, h) \ -static dist_t xavs2_pixel_ssd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ +#define PIXEL_SSD8_C(w, h) \ +static dist_t xavs2_pixel_ssd8_##w##x##h(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2)\ +{\ + dist_t sum = 0;\ + int x, y, tmp;\ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w; x++) {\ + tmp = pix1[x] - pix2[x];\ + sum += (tmp * tmp);\ + }\ + pix1 += i_pix1;\ + pix2 += i_pix2;\ + }\ + return sum;\ +} + +PIXEL_SSD8_C(64, 64) /* 64x64 */ +PIXEL_SSD8_C(64, 32) +PIXEL_SSD8_C(32, 64) +PIXEL_SSD8_C(64, 16) +PIXEL_SSD8_C(64, 48) +PIXEL_SSD8_C(16, 64) +PIXEL_SSD8_C(48, 64) +PIXEL_SSD8_C(32, 32) /* 32x32 */ +PIXEL_SSD8_C(32, 16) +PIXEL_SSD8_C(16, 32) +PIXEL_SSD8_C(32, 8) +PIXEL_SSD8_C(32, 24) +PIXEL_SSD8_C( 8, 32) +PIXEL_SSD8_C(24, 32) +PIXEL_SSD8_C(16, 16) /* 16x16 */ +PIXEL_SSD8_C(16, 8) +PIXEL_SSD8_C( 8, 16) +PIXEL_SSD8_C(16, 4) +PIXEL_SSD8_C(16, 12) +PIXEL_SSD8_C( 4, 16) +PIXEL_SSD8_C(12, 16) +PIXEL_SSD8_C( 8, 8) /* 8x8 */ +PIXEL_SSD8_C( 8, 4) +PIXEL_SSD8_C( 4, 8) +PIXEL_SSD8_C( 4, 4) /* 4x4 */ + +#define PIXEL_SSD10_C(w, h) \ +static dist_t xavs2_pixel_ssd10_##w##x##h(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2)\ {\ dist_t sum = 0;\ int x, y, tmp;\ @@ -565,46 +972,46 @@ static dist_t xavs2_pixel_ssd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, cons return sum;\ } -PIXEL_SSD_C(64, 64) 
/* 64x64 */ -PIXEL_SSD_C(64, 32) -PIXEL_SSD_C(32, 64) -PIXEL_SSD_C(64, 16) -PIXEL_SSD_C(64, 48) -PIXEL_SSD_C(16, 64) -PIXEL_SSD_C(48, 64) -PIXEL_SSD_C(32, 32) /* 32x32 */ -PIXEL_SSD_C(32, 16) -PIXEL_SSD_C(16, 32) -PIXEL_SSD_C(32, 8) -PIXEL_SSD_C(32, 24) -PIXEL_SSD_C( 8, 32) -PIXEL_SSD_C(24, 32) -PIXEL_SSD_C(16, 16) /* 16x16 */ -PIXEL_SSD_C(16, 8) -PIXEL_SSD_C( 8, 16) -PIXEL_SSD_C(16, 4) -PIXEL_SSD_C(16, 12) -PIXEL_SSD_C( 4, 16) -PIXEL_SSD_C(12, 16) -PIXEL_SSD_C( 8, 8) /* 8x8 */ -PIXEL_SSD_C( 8, 4) -PIXEL_SSD_C( 4, 8) -PIXEL_SSD_C( 4, 4) /* 4x4 */ +PIXEL_SSD10_C(64, 64) /* 64x64 */ +PIXEL_SSD10_C(64, 32) +PIXEL_SSD10_C(32, 64) +PIXEL_SSD10_C(64, 16) +PIXEL_SSD10_C(64, 48) +PIXEL_SSD10_C(16, 64) +PIXEL_SSD10_C(48, 64) +PIXEL_SSD10_C(32, 32) /* 32x32 */ +PIXEL_SSD10_C(32, 16) +PIXEL_SSD10_C(16, 32) +PIXEL_SSD10_C(32, 8) +PIXEL_SSD10_C(32, 24) +PIXEL_SSD10_C( 8, 32) +PIXEL_SSD10_C(24, 32) +PIXEL_SSD10_C(16, 16) /* 16x16 */ +PIXEL_SSD10_C(16, 8) +PIXEL_SSD10_C( 8, 16) +PIXEL_SSD10_C(16, 4) +PIXEL_SSD10_C(16, 12) +PIXEL_SSD10_C( 4, 16) +PIXEL_SSD10_C(12, 16) +PIXEL_SSD10_C( 8, 8) /* 8x8 */ +PIXEL_SSD10_C( 8, 4) +PIXEL_SSD10_C( 4, 8) +PIXEL_SSD10_C( 4, 4) /* 4x4 */ /* --------------------------------------------------------------------------- * ssd for one plane of frame */ #if XAVS2_STAT -uint64_t xavs2_pixel_ssd_wxh(pixel_funcs_t *pf, - pel_t *p_pix1, intptr_t i_pix1, - pel_t *p_pix2, intptr_t i_pix2, +uint64_t xavs2_pixel_ssd8_wxh(pixel_funcs_t *pf, + pel8_t *p_pix1, intptr_t i_pix1, + pel8_t *p_pix2, intptr_t i_pix2, int i_width, int i_height, int inout_shift) { uint64_t i_ssd = 0; int align = !(((intptr_t)p_pix1 | (intptr_t)p_pix2 | i_pix1 | i_pix2) & 15); int x, y; - pixel_ssd_t cal_ssd[2]; + pixel8_ssd_t cal_ssd[2]; if (inout_shift > 0) { int inout_offset = 1 << (inout_shift - 1); @@ -618,8 +1025,8 @@ uint64_t xavs2_pixel_ssd_wxh(pixel_funcs_t *pf, p_pix2 += i_pix2; } } else { - cal_ssd[0] = pf->ssd[LUMA_8x8]; /* 8 x 8 */ - cal_ssd[1] = pf->ssd[LUMA_16x16]; /* 16 x 
16 */ + cal_ssd[0] = pf->ssd8[LUMA_8x8]; /* 8 x 8 */ + cal_ssd[1] = pf->ssd8[LUMA_16x16]; /* 16 x 16 */ #define SSD(id) i_ssd += cal_ssd[id](p_pix1 + y*i_pix1 + x, i_pix1, p_pix2 + y*i_pix2 + x, i_pix2) @@ -668,17 +1075,132 @@ uint64_t xavs2_pixel_ssd_wxh(pixel_funcs_t *pf, return i_ssd; } -#endif +uint64_t xavs2_pixel_ssd10_wxh(pixel_funcs_t *pf, + pel10_t *p_pix1, intptr_t i_pix1, + pel10_t *p_pix2, intptr_t i_pix2, + int i_width, int i_height, + int inout_shift) +{ + uint64_t i_ssd = 0; + int align = !(((intptr_t)p_pix1 | (intptr_t)p_pix2 | i_pix1 | i_pix2) & 15); + int x, y; + pixel10_ssd_t cal_ssd[2]; + + if (inout_shift > 0) { + int inout_offset = 1 << (inout_shift - 1); + for (y = 0; y < i_height; y++) { + for (x = 0; x < i_width; x++) { + int d = ((p_pix1[x] + inout_offset) >> inout_shift) - ((p_pix2[x] + inout_offset) >> inout_shift); + i_ssd += d * d; + } + p_pix1 += i_pix1; + p_pix2 += i_pix2; + } + } else { + cal_ssd[0] = pf->ssd10[LUMA_8x8]; /* 8 x 8 */ + cal_ssd[1] = pf->ssd10[LUMA_16x16]; /* 16 x 16 */ + +#define SSD(id) i_ssd += cal_ssd[id](p_pix1 + y*i_pix1 + x, i_pix1, p_pix2 + y*i_pix2 + x, i_pix2) + + for (y = 0; y < i_height - 15;) { + if (align) { + for (x = 0; x < i_width - 15; x += 16) { + SSD(1); /* 16x16 */ + } + y += 16; + } else { + for (x = 0; x < i_width - 7; x += 8) { + SSD(0); /* 8x8 */ + } + y += 8; + for (x = 0; x < i_width - 7; x += 8) { + SSD(0); /* 8x8 */ + } + y += 8; + } + } + if (y < i_height - 7) { + for (x = 0; x < i_width - 7; x += 8) { + SSD(0); /* 8x8 */ + } + } +#undef SSD + + /* sum the rest ssd */ +#define SSD1 { int d = p_pix1[y*i_pix1+x] - p_pix2[y*i_pix2+x]; i_ssd += d*d; } + if (i_width & 7) { + for (y = 0; y < (i_height & ~7); y++) { + for (x = i_width & ~7; x < i_width; x++) { + SSD1; + } + } + } + if (i_height & 7) { + for (y = i_height & ~7; y < i_height; y++) { + for (x = 0; x < i_width; x++) { + SSD1; + } + } + } +#undef SSD1 + } + + return i_ssd; +} +#endif + +//#if !HIGH_BIT_DEPTH /** * 
--------------------------------------------------------------------------- * AVG * --------------------------------------------------------------------------- */ -#define PIXEL_AVG_C(w, h) \ -static void xavs2_pixel_avg_##w##x##h(pel_t* dst, intptr_t dstride, const pel_t* src0, intptr_t sstride0, const pel_t* src1, intptr_t sstride1, int weight)\ +#define PIXEL_AVG8_C(w, h) \ +static void xavs2_pixel_avg8_##w##x##h(pel8_t* dst, intptr_t dstride, const pel8_t* src0, intptr_t sstride0, const pel8_t* src1, intptr_t sstride1, int weight)\ +{\ + int x, y;\ + UNUSED_PARAMETER(weight); \ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w; x++) {\ + dst[x] = (src0[x] + src1[x] + 1) >> 1;\ + }\ + dst += dstride;\ + src0 += sstride0;\ + src1 += sstride1;\ + }\ +} + +PIXEL_AVG8_C(64, 64) /* 64x64 */ +PIXEL_AVG8_C(64, 32) +PIXEL_AVG8_C(32, 64) +PIXEL_AVG8_C(64, 16) +PIXEL_AVG8_C(64, 48) +PIXEL_AVG8_C(16, 64) +PIXEL_AVG8_C(48, 64) +PIXEL_AVG8_C(32, 32) /* 32x32 */ +PIXEL_AVG8_C(32, 16) +PIXEL_AVG8_C(16, 32) +PIXEL_AVG8_C(32, 8) +PIXEL_AVG8_C(32, 24) +PIXEL_AVG8_C( 8, 32) +PIXEL_AVG8_C(24, 32) +PIXEL_AVG8_C(16, 16) /* 16x16 */ +PIXEL_AVG8_C(16, 8) +PIXEL_AVG8_C( 8, 16) +PIXEL_AVG8_C(16, 4) +PIXEL_AVG8_C(16, 12) +PIXEL_AVG8_C( 4, 16) +PIXEL_AVG8_C(12, 16) +PIXEL_AVG8_C( 8, 8) /* 8x8 */ +PIXEL_AVG8_C( 8, 4) +PIXEL_AVG8_C( 4, 8) +PIXEL_AVG8_C( 4, 4) /* 4x4 */ + +#define PIXEL_AVG10_C(w, h) \ +static void xavs2_pixel_avg10_##w##x##h(pel10_t* dst, intptr_t dstride, const pel10_t* src0, intptr_t sstride0, const pel10_t* src1, intptr_t sstride1, int weight)\ {\ int x, y;\ UNUSED_PARAMETER(weight); \ @@ -692,31 +1214,32 @@ static void xavs2_pixel_avg_##w##x##h(pel_t* dst, intptr_t dstride, const pel_t* }\ } -PIXEL_AVG_C(64, 64) /* 64x64 */ -PIXEL_AVG_C(64, 32) -PIXEL_AVG_C(32, 64) -PIXEL_AVG_C(64, 16) -PIXEL_AVG_C(64, 48) -PIXEL_AVG_C(16, 64) -PIXEL_AVG_C(48, 64) -PIXEL_AVG_C(32, 32) /* 32x32 */ -PIXEL_AVG_C(32, 16) -PIXEL_AVG_C(16, 32) -PIXEL_AVG_C(32, 8) -PIXEL_AVG_C(32, 24) 
-PIXEL_AVG_C( 8, 32) -PIXEL_AVG_C(24, 32) -PIXEL_AVG_C(16, 16) /* 16x16 */ -PIXEL_AVG_C(16, 8) -PIXEL_AVG_C( 8, 16) -PIXEL_AVG_C(16, 4) -PIXEL_AVG_C(16, 12) -PIXEL_AVG_C( 4, 16) -PIXEL_AVG_C(12, 16) -PIXEL_AVG_C( 8, 8) /* 8x8 */ -PIXEL_AVG_C( 8, 4) -PIXEL_AVG_C( 4, 8) -PIXEL_AVG_C( 4, 4) /* 4x4 */ +PIXEL_AVG10_C(64, 64) /* 64x64 */ +PIXEL_AVG10_C(64, 32) +PIXEL_AVG10_C(32, 64) +PIXEL_AVG10_C(64, 16) +PIXEL_AVG10_C(64, 48) +PIXEL_AVG10_C(16, 64) +PIXEL_AVG10_C(48, 64) +PIXEL_AVG10_C(32, 32) /* 32x32 */ +PIXEL_AVG10_C(32, 16) +PIXEL_AVG10_C(16, 32) +PIXEL_AVG10_C(32, 8) +PIXEL_AVG10_C(32, 24) +PIXEL_AVG10_C( 8, 32) +PIXEL_AVG10_C(24, 32) +PIXEL_AVG10_C(16, 16) /* 16x16 */ +PIXEL_AVG10_C(16, 8) +PIXEL_AVG10_C( 8, 16) +PIXEL_AVG10_C(16, 4) +PIXEL_AVG10_C(16, 12) +PIXEL_AVG10_C( 4, 16) +PIXEL_AVG10_C(12, 16) +PIXEL_AVG10_C( 8, 8) /* 8x8 */ +PIXEL_AVG10_C( 8, 4) +PIXEL_AVG10_C( 4, 8) +PIXEL_AVG10_C( 4, 4) /* 4x4 */ +//#endif /** @@ -724,8 +1247,8 @@ PIXEL_AVG_C( 4, 4) /* 4x4 */ * block operation: copy/add/sub (p: pixel, s: short) * --------------------------------------------------------------------------- */ -#define BLOCKCOPY_PP_C(w, h) \ -static void xavs2_blockcopy_pp_##w##x##h(pel_t *a, intptr_t stridea, const pel_t *b, intptr_t strideb)\ +#define BLOCKCOPY_PP8_C(w, h) \ +static void xavs2_blockcopy_pp8_##w##x##h(pel8_t *a, intptr_t stridea, const pel8_t *b, intptr_t strideb)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ @@ -737,8 +1260,8 @@ static void xavs2_blockcopy_pp_##w##x##h(pel_t *a, intptr_t stridea, const pel_t }\ } -#define BLOCKCOPY_SS_C(w, h) \ -static void xavs2_blockcopy_ss_##w##x##h(coeff_t* a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\ +#define BLOCKCOPY_PP10_C(w, h) \ +static void xavs2_blockcopy_pp10_##w##x##h(pel10_t *a, intptr_t stridea, const pel10_t *b, intptr_t strideb)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ @@ -750,22 +1273,62 @@ static void xavs2_blockcopy_ss_##w##x##h(coeff_t* a, intptr_t stridea, const coe }\ } -#define 
BLOCKCOPY_SP_C(w, h) \ -static void xavs2_blockcopy_sp_##w##x##h(pel_t *a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\ +#define BLOCKCOPY_SS8_C(w, h) \ +static void xavs2_blockcopy_ss8_##w##x##h(coeff_t* a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\ +{\ + int x, y;\ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w; x++) {\ + a[x] = b[x];\ + }\ + a += stridea;\ + b += strideb;\ + }\ +} + +#define BLOCKCOPY_SS10_C(w, h) \ +static void xavs2_blockcopy_ss10_##w##x##h(coeff_t* a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\ +{\ + int x, y;\ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w; x++) {\ + a[x] = b[x];\ + }\ + a += stridea;\ + b += strideb;\ + }\ +} + +#define BLOCKCOPY_SP8_C(w, h) \ +static void xavs2_blockcopy_sp8_##w##x##h(pel8_t *a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\ +{\ + int x, y;\ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w; x++) {\ + assert((b[x] >= 0) && (b[x] <= ((1 << 8) - 1)));\ + a[x] = (pel8_t)b[x];\ + }\ + a += stridea;\ + b += strideb;\ + }\ +} + +#define BLOCKCOPY_SP10_C(w, h) \ +static void xavs2_blockcopy_sp10_##w##x##h(pel10_t *a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ assert((b[x] >= 0) && (b[x] <= ((1 << 8) - 1)));\ - a[x] = (pel_t)b[x];\ + a[x] = (pel10_t)b[x];\ }\ a += stridea;\ b += strideb;\ }\ } -#define BLOCKCOPY_PS_C(w, h) \ -static void xavs2_blockcopy_ps_##w##x##h(coeff_t *a, intptr_t stridea, const pel_t *b, intptr_t strideb)\ +#define BLOCKCOPY_PS8_C(w, h) \ +static void xavs2_blockcopy_ps8_##w##x##h(coeff_t *a, intptr_t stridea, const pel8_t *b, intptr_t strideb)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ @@ -775,10 +1338,37 @@ static void xavs2_blockcopy_ps_##w##x##h(coeff_t *a, intptr_t stridea, const pel a += stridea;\ b += strideb;\ }\ -}\ - -#define PIXEL_SUB_PS_C(w, h) \ -static void xavs2_pixel_sub_ps_##w##x##h(coeff_t *a, intptr_t dstride, const pel_t *b0, const pel_t *b1, 
intptr_t sstride0, intptr_t sstride1)\ +} + +#define BLOCKCOPY_PS10_C(w, h) \ +static void xavs2_blockcopy_ps10_##w##x##h(coeff_t *a, intptr_t stridea, const pel10_t *b, intptr_t strideb)\ +{\ + int x, y;\ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w; x++) {\ + a[x] = (int16_t)b[x];\ + }\ + a += stridea;\ + b += strideb;\ + }\ +} + +#define PIXEL_SUB_PS8_C(w, h) \ +static void xavs2_pixel_sub_ps8_##w##x##h(coeff_t *a, intptr_t dstride, const pel8_t *b0, const pel8_t *b1, intptr_t sstride0, intptr_t sstride1)\ +{\ + int x, y;\ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w; x++) {\ + a[x] = (int16_t)(b0[x] - b1[x]);\ + }\ + b0 += sstride0;\ + b1 += sstride1;\ + a += dstride;\ + }\ +} + +#define PIXEL_SUB_PS10_C(w, h) \ +static void xavs2_pixel_sub_ps10_##w##x##h(coeff_t *a, intptr_t dstride, const pel10_t *b0, const pel10_t *b1, intptr_t sstride0, intptr_t sstride1)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ @@ -791,13 +1381,15 @@ static void xavs2_pixel_sub_ps_##w##x##h(coeff_t *a, intptr_t dstride, const pel }\ } -#define PIXEL_ADD_PS_C(w, h) \ -static void xavs2_pixel_add_ps_##w##x##h(pel_t *a, intptr_t dstride, const pel_t *b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1)\ +#define XAVS2_CLIP1(cc, bb) ((cc) > ((1 << bb->param->input_sample_bit_depth) - 1) ? ((1 << bb->param->input_sample_bit_depth) - 1) : ((cc) < 0 ? 
0 : (cc))) + +#define PIXEL_ADD_PS8_C(w, h) \ +static void xavs2_pixel_add_ps8_##w##x##h(xavs2_t* bb, pel8_t *a, intptr_t dstride, const pel8_t *b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ - a[x] = (pel_t)XAVS2_CLIP1(b0[x] + b1[x]);\ + a[x] = (pel8_t)XAVS2_CLIP1(b0[x] + b1[x], bb);\ }\ b0 += sstride0;\ b1 += sstride1;\ @@ -805,49 +1397,98 @@ static void xavs2_pixel_add_ps_##w##x##h(pel_t *a, intptr_t dstride, const pel_t }\ } -#define BLOCK_OP_C(w, h) \ - BLOCKCOPY_PP_C(w, h);\ - BLOCKCOPY_SS_C(w, h);\ - BLOCKCOPY_SP_C(w, h);\ - BLOCKCOPY_PS_C(w, h);\ - PIXEL_SUB_PS_C(w, h);\ - PIXEL_ADD_PS_C(w, h); - -BLOCK_OP_C(64, 64) /* 64x64 */ -BLOCK_OP_C(64, 32) -BLOCK_OP_C(32, 64) -BLOCK_OP_C(64, 16) -BLOCK_OP_C(64, 48) -BLOCK_OP_C(16, 64) -BLOCK_OP_C(48, 64) -BLOCK_OP_C(32, 32) /* 32x32 */ -BLOCK_OP_C(32, 16) -BLOCK_OP_C(16, 32) -BLOCK_OP_C(32, 8) -BLOCK_OP_C(32, 24) -BLOCK_OP_C( 8, 32) -BLOCK_OP_C(24, 32) -BLOCK_OP_C(16, 16) /* 16x16 */ -BLOCK_OP_C(16, 8) -BLOCK_OP_C( 8, 16) -BLOCK_OP_C(16, 4) -BLOCK_OP_C(16, 12) -BLOCK_OP_C( 4, 16) -BLOCK_OP_C(12, 16) -BLOCK_OP_C( 8, 8) /* 8x8 */ -BLOCK_OP_C( 8, 4) -BLOCK_OP_C( 4, 8) -BLOCK_OP_C( 4, 4) /* 4x4 */ +#define BLOCK_OP8_C(w, h) \ + BLOCKCOPY_PP8_C(w, h);\ + BLOCKCOPY_SS8_C(w, h);\ + BLOCKCOPY_SP8_C(w, h);\ + BLOCKCOPY_PS8_C(w, h);\ + PIXEL_SUB_PS8_C(w, h);\ + PIXEL_ADD_PS8_C(w, h); + +BLOCK_OP8_C(64, 64) /* 64x64 */ +BLOCK_OP8_C(64, 32) +BLOCK_OP8_C(32, 64) +BLOCK_OP8_C(64, 16) +BLOCK_OP8_C(64, 48) +BLOCK_OP8_C(16, 64) +BLOCK_OP8_C(48, 64) +BLOCK_OP8_C(32, 32) /* 32x32 */ +BLOCK_OP8_C(32, 16) +BLOCK_OP8_C(16, 32) +BLOCK_OP8_C(32, 8) +BLOCK_OP8_C(32, 24) +BLOCK_OP8_C( 8, 32) +BLOCK_OP8_C(24, 32) +BLOCK_OP8_C(16, 16) /* 16x16 */ +BLOCK_OP8_C(16, 8) +BLOCK_OP8_C( 8, 16) +BLOCK_OP8_C(16, 4) +BLOCK_OP8_C(16, 12) +BLOCK_OP8_C( 4, 16) +BLOCK_OP8_C(12, 16) +BLOCK_OP8_C( 8, 8) /* 8x8 */ +BLOCK_OP8_C( 8, 4) +BLOCK_OP8_C( 4, 8) +BLOCK_OP8_C( 4, 4) /* 
4x4 */ + +#define PIXEL_ADD_PS10_C(w, h) \ +static void xavs2_pixel_add_ps10_##w##x##h(xavs2_t* bb, pel10_t *a, intptr_t dstride, const pel10_t *b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1)\ +{\ + int x, y;\ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w; x++) {\ + a[x] = (pel10_t)XAVS2_CLIP1(b0[x] + b1[x], bb);\ + }\ + b0 += sstride0;\ + b1 += sstride1;\ + a += dstride;\ + }\ +} +#define BLOCK_OP10_C(w, h) \ + BLOCKCOPY_PP10_C(w, h);\ + BLOCKCOPY_SS10_C(w, h);\ + BLOCKCOPY_SP10_C(w, h);\ + BLOCKCOPY_PS10_C(w, h);\ + PIXEL_SUB_PS10_C(w, h);\ + PIXEL_ADD_PS10_C(w, h); + +BLOCK_OP10_C(64, 64) /* 64x64 */ +BLOCK_OP10_C(64, 32) +BLOCK_OP10_C(32, 64) +BLOCK_OP10_C(64, 16) +BLOCK_OP10_C(64, 48) +BLOCK_OP10_C(16, 64) +BLOCK_OP10_C(48, 64) +BLOCK_OP10_C(32, 32) /* 32x32 */ +BLOCK_OP10_C(32, 16) +BLOCK_OP10_C(16, 32) +BLOCK_OP10_C(32, 8) +BLOCK_OP10_C(32, 24) +BLOCK_OP10_C( 8, 32) +BLOCK_OP10_C(24, 32) +BLOCK_OP10_C(16, 16) /* 16x16 */ +BLOCK_OP10_C(16, 8) +BLOCK_OP10_C( 8, 16) +BLOCK_OP10_C(16, 4) +BLOCK_OP10_C(16, 12) +BLOCK_OP10_C( 4, 16) +BLOCK_OP10_C(12, 16) +BLOCK_OP10_C( 8, 8) /* 8x8 */ +BLOCK_OP10_C( 8, 4) +BLOCK_OP10_C( 4, 8) +BLOCK_OP10_C( 4, 4) /* 4x4 */ + +//#if !HIGH_BIT_DEPTH /* --------------------------------------------------------------------------- */ -static void xavs2_pixel_average(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height) +static void xavs2_pixel_average8(pel8_t *dst, int i_dst, pel8_t *src1, int i_src1, pel8_t *src2, int i_src2, int width, int height) { int i, j; for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { - dst[j] = (pel_t)((src1[j] + src2[j] + 1) >> 1); + dst[j] = (pel8_t)((src1[j] + src2[j] + 1) >> 1); } dst += i_dst; src1 += i_src1; @@ -855,10 +1496,25 @@ static void xavs2_pixel_average(pel_t *dst, int i_dst, pel_t *src1, int i_src1, } } +static void xavs2_pixel_average10(pel10_t *dst, int i_dst, pel10_t *src1, int i_src1, pel10_t *src2, int i_src2, int width, 
int height) +{ + int i, j; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + dst[j] = (pel10_t)((src1[j] + src2[j] + 1) >> 1); + } + dst += i_dst; + src1 += i_src1; + src2 += i_src2; + } +} +//#endif + /* --------------------------------------------------------------------------- * init functions of block operation : copy / add / sub */ -static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf) +static void init_block_opreation_funcs(xavs2_param_t* param, uint32_t cpuid, pixel_funcs_t* pixf) { #define ALL_LUMA_CU(name1, name2, cpu) \ pixf->name1[LUMA_64x64] = xavs2_ ## name2 ## _64x64 ## cpu;\ @@ -897,27 +1553,104 @@ static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf) /* ------------------------------------------------------------- * init all c functions */ + if (param->input_sample_bit_depth == 8) { //ALL_LUMA_CU(add_ps, pixel_add_ps, ); - ALL_LUMA_PU(add_ps, pixel_add_ps, ); + ALL_LUMA_PU(add_ps8, pixel_add_ps8, ); // ALL_LUMA_CU(sub_ps, pixel_sub_ps, ); - ALL_LUMA_PU(sub_ps, pixel_sub_ps, ); - ALL_LUMA_PU(copy_sp, blockcopy_sp, ); - ALL_LUMA_PU(copy_ps, blockcopy_ps, ); - ALL_LUMA_PU(copy_ss, blockcopy_ss, ); - ALL_LUMA_PU(copy_pp, blockcopy_pp, ); - pixf->ssd_block = xavs2_get_block_ssd_c; + ALL_LUMA_PU(sub_ps8, pixel_sub_ps8, ); + ALL_LUMA_PU(copy_sp8, blockcopy_sp8, ); + ALL_LUMA_PU(copy_ps8, blockcopy_ps8, ); + ALL_LUMA_PU(copy_ss8, blockcopy_ss8, ); + ALL_LUMA_PU(copy_pp8, blockcopy_pp8, ); +//#if !HIGH_BIT_DEPTH + pixf->ssd_block8 = xavs2_get_block_ssd8_c; +//#endif + } else { + //ALL_LUMA_CU(add_ps10, pixel_add_ps10, ); + ALL_LUMA_PU(add_ps10, pixel_add_ps10, ); +// ALL_LUMA_CU(sub_ps10, pixel_sub_ps10, ); + ALL_LUMA_PU(sub_ps10, pixel_sub_ps10, ); + ALL_LUMA_PU(copy_sp10, blockcopy_sp10, ); + ALL_LUMA_PU(copy_ps10, blockcopy_ps10, ); + ALL_LUMA_PU(copy_ss10, blockcopy_ss10, ); + ALL_LUMA_PU(copy_pp10, blockcopy_pp10, ); +//#if !HIGH_BIT_DEPTH + pixf->ssd_block10 = xavs2_get_block_ssd10_c; 
+//#endif + } /* ------------------------------------------------------------- * init all SIMD functions */ #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE2) { +#if HIGH_BIT_DEPTH + //10bit assemble + if (sizeof(pel_t) == sizeof(int16_t) && cpuid) { + pixf->copy_pp[LUMA_64x64] = (copy_pp_t)xavs2_blockcopy_ss_64x64_sse2; /* 64x64 */ + pixf->copy_pp[LUMA_64x32] = (copy_pp_t)xavs2_blockcopy_ss_64x32_sse2; + pixf->copy_pp[LUMA_32x64] = (copy_pp_t)xavs2_blockcopy_ss_32x64_sse2; + pixf->copy_pp[LUMA_64x16] = (copy_pp_t)xavs2_blockcopy_ss_64x16_sse2; + pixf->copy_pp[LUMA_64x48] = (copy_pp_t)xavs2_blockcopy_ss_64x48_sse2; + pixf->copy_pp[LUMA_16x64] = (copy_pp_t)xavs2_blockcopy_ss_16x64_sse2; + pixf->copy_pp[LUMA_48x64] = (copy_pp_t)xavs2_blockcopy_ss_48x64_sse2; + pixf->copy_pp[LUMA_32x32] = (copy_pp_t)xavs2_blockcopy_ss_32x32_sse2; /* 32x32 */ + pixf->copy_pp[LUMA_32x16] = (copy_pp_t)xavs2_blockcopy_ss_32x16_sse2; + pixf->copy_pp[LUMA_16x32] = (copy_pp_t)xavs2_blockcopy_ss_16x32_sse2; + pixf->copy_pp[LUMA_32x8 ] = (copy_pp_t)xavs2_blockcopy_ss_32x8_sse2; + pixf->copy_pp[LUMA_32x24] = (copy_pp_t)xavs2_blockcopy_ss_32x24_sse2; + pixf->copy_pp[LUMA_8x32 ] = (copy_pp_t)xavs2_blockcopy_ss_8x32_sse2; + pixf->copy_pp[LUMA_24x32] = (copy_pp_t)xavs2_blockcopy_ss_24x32_sse2; + pixf->copy_pp[LUMA_16x16] = (copy_pp_t)xavs2_blockcopy_ss_16x16_sse2; /* 16x16 */ + pixf->copy_pp[LUMA_16x8 ] = (copy_pp_t)xavs2_blockcopy_ss_16x8_sse2; + pixf->copy_pp[LUMA_8x16 ] = (copy_pp_t)xavs2_blockcopy_ss_8x16_sse2; + pixf->copy_pp[LUMA_16x4 ] = (copy_pp_t)xavs2_blockcopy_ss_16x4_sse2; + pixf->copy_pp[LUMA_16x12] = (copy_pp_t)xavs2_blockcopy_ss_16x12_sse2; + pixf->copy_pp[LUMA_4x16 ] = (copy_pp_t)xavs2_blockcopy_ss_4x16_sse2; + pixf->copy_pp[LUMA_12x16] = (copy_pp_t)xavs2_blockcopy_ss_12x16_sse2; + pixf->copy_pp[LUMA_8x8 ] = (copy_pp_t)xavs2_blockcopy_ss_8x8_sse2; /* 8x8 */ + pixf->copy_pp[LUMA_8x4 ] = (copy_pp_t)xavs2_blockcopy_ss_8x4_sse2; + pixf->copy_pp[LUMA_4x8 ] = (copy_pp_t)xavs2_blockcopy_ss_4x8_sse2; 
+ pixf->copy_pp[LUMA_4x4 ] = (copy_pp_t)xavs2_blockcopy_ss_4x4_sse2; /* 4x4 */ + } + if (sizeof(coeff_t) == sizeof(int16_t) && cpuid) { + pixf->copy_ss[LUMA_64x64] = (copy_ss_t)xavs2_blockcopy_ss_64x64_sse2; /* 64x64 */ + pixf->copy_ss[LUMA_64x32] = (copy_ss_t)xavs2_blockcopy_ss_64x32_sse2; + pixf->copy_ss[LUMA_32x64] = (copy_ss_t)xavs2_blockcopy_ss_32x64_sse2; + pixf->copy_ss[LUMA_64x16] = (copy_ss_t)xavs2_blockcopy_ss_64x16_sse2; + pixf->copy_ss[LUMA_64x48] = (copy_ss_t)xavs2_blockcopy_ss_64x48_sse2; + pixf->copy_ss[LUMA_16x64] = (copy_ss_t)xavs2_blockcopy_ss_16x64_sse2; + pixf->copy_ss[LUMA_48x64] = (copy_ss_t)xavs2_blockcopy_ss_48x64_sse2; + pixf->copy_ss[LUMA_32x32] = (copy_ss_t)xavs2_blockcopy_ss_32x32_sse2; /* 32x32 */ + pixf->copy_ss[LUMA_32x16] = (copy_ss_t)xavs2_blockcopy_ss_32x16_sse2; + pixf->copy_ss[LUMA_16x32] = (copy_ss_t)xavs2_blockcopy_ss_16x32_sse2; + pixf->copy_ss[LUMA_32x8 ] = (copy_ss_t)xavs2_blockcopy_ss_32x8_sse2; + pixf->copy_ss[LUMA_32x24] = (copy_ss_t)xavs2_blockcopy_ss_32x24_sse2; + pixf->copy_ss[LUMA_8x32 ] = (copy_ss_t)xavs2_blockcopy_ss_8x32_sse2; + pixf->copy_ss[LUMA_24x32] = (copy_ss_t)xavs2_blockcopy_ss_24x32_sse2; + pixf->copy_ss[LUMA_16x16] = (copy_ss_t)xavs2_blockcopy_ss_16x16_sse2; /* 16x16 */ + pixf->copy_ss[LUMA_16x8 ] = (copy_ss_t)xavs2_blockcopy_ss_16x8_sse2; + pixf->copy_ss[LUMA_8x16 ] = (copy_ss_t)xavs2_blockcopy_ss_8x16_sse2; + pixf->copy_ss[LUMA_16x4 ] = (copy_ss_t)xavs2_blockcopy_ss_16x4_sse2; + pixf->copy_ss[LUMA_16x12] = (copy_ss_t)xavs2_blockcopy_ss_16x12_sse2; + pixf->copy_ss[LUMA_4x16 ] = (copy_ss_t)xavs2_blockcopy_ss_4x16_sse2; + pixf->copy_ss[LUMA_12x16] = (copy_ss_t)xavs2_blockcopy_ss_12x16_sse2; + pixf->copy_ss[LUMA_8x8 ] = (copy_ss_t)xavs2_blockcopy_ss_8x8_sse2; /* 8x8 */ + pixf->copy_ss[LUMA_8x4 ] = (copy_ss_t)xavs2_blockcopy_ss_8x4_sse2; + pixf->copy_ss[LUMA_4x8 ] = (copy_ss_t)xavs2_blockcopy_ss_4x8_sse2; + pixf->copy_ss[LUMA_4x4 ] = (copy_ss_t)xavs2_blockcopy_ss_4x4_sse2; /* 4x4 */ + } +#else 
ALL_LUMA_PU(copy_sp, blockcopy_sp, _sse2); ALL_LUMA_PU(copy_ss, blockcopy_ss, _sse2); ALL_LUMA_PU(copy_pp, blockcopy_pp, _sse2); +#endif } if (cpuid & XAVS2_CPU_SSE4) { +#if HIGH_BIT_DEPTH + //10bit assemble +#else pixf->add_ps [LUMA_4x4 ] = xavs2_pixel_add_ps_4x4_sse4; pixf->add_ps [LUMA_4x8 ] = xavs2_pixel_add_ps_4x8_sse4; pixf->add_ps [LUMA_4x16 ] = xavs2_pixel_add_ps_4x16_sse4; @@ -961,9 +1694,51 @@ static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sub_ps [LUMA_64x64] = xavs2_pixel_sub_ps_64x64_sse4; ALL_LUMA_PU(copy_ps, blockcopy_ps, _sse4); +#endif } - if (cpuid & XAVS2_CPU_AVX) { + if (cpuid & XAVS2_CPU_AVX2) { +#if HIGH_BIT_DEPTH + //10bit assemble + if (sizeof(pel_t) == sizeof(int16_t) && cpuid) { + pixf->copy_pp[LUMA_64x64] = (copy_pp_t)xavs2_blockcopy_ss_64x64_avx; + pixf->copy_pp[LUMA_64x32] = (copy_pp_t)xavs2_blockcopy_ss_64x32_avx; + pixf->copy_pp[LUMA_32x64] = (copy_pp_t)xavs2_blockcopy_ss_32x64_avx; + pixf->copy_pp[LUMA_64x16] = (copy_pp_t)xavs2_blockcopy_ss_64x16_avx; + pixf->copy_pp[LUMA_64x48] = (copy_pp_t)xavs2_blockcopy_ss_64x48_avx; + pixf->copy_pp[LUMA_16x64] = (copy_pp_t)xavs2_blockcopy_ss_16x64_avx; + pixf->copy_pp[LUMA_48x64] = (copy_pp_t)xavs2_blockcopy_ss_48x64_avx; + pixf->copy_pp[LUMA_32x32] = (copy_pp_t)xavs2_blockcopy_ss_32x32_avx; + pixf->copy_pp[LUMA_32x16] = (copy_pp_t)xavs2_blockcopy_ss_32x16_avx; + pixf->copy_pp[LUMA_16x32] = (copy_pp_t)xavs2_blockcopy_ss_16x32_avx; + pixf->copy_pp[LUMA_32x8 ] = (copy_pp_t)xavs2_blockcopy_ss_32x8_avx; + pixf->copy_pp[LUMA_32x24] = (copy_pp_t)xavs2_blockcopy_ss_32x24_avx; + pixf->copy_pp[LUMA_24x32] = (copy_pp_t)xavs2_blockcopy_ss_24x32_avx; + pixf->copy_pp[LUMA_16x16] = (copy_pp_t)xavs2_blockcopy_ss_16x16_avx; + pixf->copy_pp[LUMA_16x8 ] = (copy_pp_t)xavs2_blockcopy_ss_16x8_avx; + pixf->copy_pp[LUMA_16x4 ] = (copy_pp_t)xavs2_blockcopy_ss_16x4_avx; + pixf->copy_pp[LUMA_16x12] = (copy_pp_t)xavs2_blockcopy_ss_16x12_avx; + } + if (sizeof(coeff_t) == sizeof(int16_t) && 
cpuid) { + pixf->copy_ss[LUMA_64x64] = (copy_ss_t)xavs2_blockcopy_ss_64x64_avx; + pixf->copy_ss[LUMA_64x32] = (copy_ss_t)xavs2_blockcopy_ss_64x32_avx; + pixf->copy_ss[LUMA_32x64] = (copy_ss_t)xavs2_blockcopy_ss_32x64_avx; + pixf->copy_ss[LUMA_64x16] = (copy_ss_t)xavs2_blockcopy_ss_64x16_avx; + pixf->copy_ss[LUMA_64x48] = (copy_ss_t)xavs2_blockcopy_ss_64x48_avx; + pixf->copy_ss[LUMA_16x64] = (copy_ss_t)xavs2_blockcopy_ss_16x64_avx; + pixf->copy_ss[LUMA_48x64] = (copy_ss_t)xavs2_blockcopy_ss_48x64_avx; + pixf->copy_ss[LUMA_32x32] = (copy_ss_t)xavs2_blockcopy_ss_32x32_avx; + pixf->copy_ss[LUMA_32x16] = (copy_ss_t)xavs2_blockcopy_ss_32x16_avx; + pixf->copy_ss[LUMA_16x32] = (copy_ss_t)xavs2_blockcopy_ss_16x32_avx; + pixf->copy_ss[LUMA_32x8 ] = (copy_ss_t)xavs2_blockcopy_ss_32x8_avx; + pixf->copy_ss[LUMA_32x24] = (copy_ss_t)xavs2_blockcopy_ss_32x24_avx; + pixf->copy_ss[LUMA_24x32] = (copy_ss_t)xavs2_blockcopy_ss_24x32_avx; + pixf->copy_ss[LUMA_16x16] = (copy_ss_t)xavs2_blockcopy_ss_16x16_avx; + pixf->copy_ss[LUMA_16x8 ] = (copy_ss_t)xavs2_blockcopy_ss_16x8_avx; + pixf->copy_ss[LUMA_16x4 ] = (copy_ss_t)xavs2_blockcopy_ss_16x4_avx; + pixf->copy_ss[LUMA_16x12] = (copy_ss_t)xavs2_blockcopy_ss_16x12_avx; + } +#else pixf->copy_pp[LUMA_64x64] = xavs2_blockcopy_pp_64x64_avx; pixf->copy_pp[LUMA_64x32] = xavs2_blockcopy_pp_64x32_avx; pixf->copy_pp[LUMA_32x64] = xavs2_blockcopy_pp_32x64_avx; @@ -992,9 +1767,14 @@ static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf) pixf->copy_ss[LUMA_16x8 ] = xavs2_blockcopy_ss_16x8_avx; pixf->copy_ss[LUMA_16x4 ] = xavs2_blockcopy_ss_16x4_avx; pixf->copy_ss[LUMA_16x12] = xavs2_blockcopy_ss_16x12_avx; +#endif } +#if defined(__AVX2__) if (cpuid & XAVS2_CPU_AVX2) { +#if HIGH_BIT_DEPTH + //10bit assemble +#else pixf->add_ps [LUMA_16x4 ] = xavs2_pixel_add_ps_16x4_avx2; pixf->add_ps [LUMA_16x8 ] = xavs2_pixel_add_ps_16x8_avx2; pixf->add_ps [LUMA_16x16] = xavs2_pixel_add_ps_16x16_avx2; @@ -1035,7 +1815,9 @@ static void 
init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf) pixf->copy_ps[LUMA_32x32] = xavs2_blockcopy_ps_32x32_avx2; pixf->copy_ps[LUMA_16x32] = xavs2_blockcopy_ps_16x32_avx2; pixf->copy_ps[LUMA_16x16] = xavs2_blockcopy_ps_16x16_avx2; +#endif } +#endif #endif // if HAVE_MMX #undef ALL_LUMA_CU @@ -1047,8 +1829,9 @@ static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf) * pixel init * --------------------------------------------------------------------------- */ -void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) +void xavs2_pixel_init(xavs2_param_t* param, uint32_t cpuid, pixel_funcs_t* pixf) { +//#if !HIGH_BIT_DEPTH /* ------------------------------------------------------------- */ #define INIT_PIXEL_FUNC(name, cpu) \ @@ -1082,7 +1865,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->name[LUMA_4x8 ] = xavs2_pixel_ ## name ## _4x8 ## cpu;\ /* 4x4 */ \ pixf->name[LUMA_4x4 ] = xavs2_pixel_ ## name ## _4x4 ## cpu; - +//#endif /* ------------------------------------------------------------- */ @@ -1112,6 +1895,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->satd[LUMA_8x4 ] = xavs2_pixel_satd_8x4_ ## cpu;\ pixf->satd[LUMA_4x8 ] = xavs2_pixel_satd_4x8_ ## cpu; +//#if !HIGH_BIT_DEPTH /* ------------------------------------------------------------- */ #define INIT_SSD(cpu) \ @@ -1138,21 +1922,35 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) /* ------------------------------------------------------------- * init all c functions */ - INIT_PIXEL_FUNC(sad, ); // sad - INIT_PIXEL_FUNC(sad_x3, ); // sad_x3 - INIT_PIXEL_FUNC(sad_x4, ); // sad_x4 - INIT_PIXEL_FUNC(satd, ); // satd - INIT_PIXEL_FUNC(ssd, ); // ssd - INIT_PIXEL_FUNC(avg, ); // avg - INIT_PIXEL_FUNC(sa8d, ); // sa8d - - pixf->average = xavs2_pixel_average;// block average + if (param->input_sample_bit_depth == 8) { + INIT_PIXEL_FUNC(sad8, ); // sad + INIT_PIXEL_FUNC(sad8_x3, ); // sad_x3 + INIT_PIXEL_FUNC(sad8_x4, ); // 
sad_x4 + INIT_PIXEL_FUNC(satd8, ); // satd + INIT_PIXEL_FUNC(ssd8, ); // ssd + INIT_PIXEL_FUNC(avg8, ); // avg + INIT_PIXEL_FUNC(sa8d8, ); // sa8d + + pixf->average8 = xavs2_pixel_average8;// block average + } else { + INIT_PIXEL_FUNC(sad10, ); // sad + INIT_PIXEL_FUNC(sad10_x3, ); // sad_x3 + INIT_PIXEL_FUNC(sad10_x4, ); // sad_x4 + INIT_PIXEL_FUNC(satd10, ); // satd + INIT_PIXEL_FUNC(ssd10, ); // ssd + INIT_PIXEL_FUNC(avg10, ); // avg + INIT_PIXEL_FUNC(sa8d10, ); // sa8d + + pixf->average10 = xavs2_pixel_average10;// block average + } +//#endif /* ------------------------------------------------------------- * init SIMD functions */ #if HAVE_MMX if (cpuid & XAVS2_CPU_MMX2) { +#if !HIGH_BIT_DEPTH pixf->sad [LUMA_16x16] = xavs2_pixel_sad_16x16_mmx2; pixf->sad [LUMA_16x8 ] = xavs2_pixel_sad_16x8_mmx2; pixf->sad [LUMA_8x16 ] = xavs2_pixel_sad_8x16_mmx2; @@ -1190,6 +1988,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->ssd [LUMA_8x4 ] = xavs2_pixel_ssd_8x4_mmx; pixf->ssd [LUMA_4x8 ] = xavs2_pixel_ssd_4x8_mmx; pixf->ssd [LUMA_4x4 ] = xavs2_pixel_ssd_4x4_mmx; +#endif pixf->satd [LUMA_16x16] = xavs2_pixel_satd_16x16_mmx2; pixf->satd [LUMA_16x8 ] = xavs2_pixel_satd_16x8_mmx2; @@ -1211,6 +2010,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) } if (cpuid & XAVS2_CPU_SSE2) { +#if !HIGH_BIT_DEPTH pixf->sad [LUMA_16x16] = xavs2_pixel_sad_16x16_sse2; pixf->sad [LUMA_16x8 ] = xavs2_pixel_sad_16x8_sse2; pixf->sad [LUMA_16x12] = xavs2_pixel_sad_16x12_sse2; @@ -1232,6 +2032,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sad [LUMA_48x64] = xavs2_pixel_sad_48x64_sse2; pixf->sad [LUMA_24x32] = xavs2_pixel_sad_24x32_sse2; pixf->sad [LUMA_12x16] = xavs2_pixel_sad_12x16_sse2; +#endif pixf->sa8d [LUMA_64x16] = xavs2_pixel_sa8d_64x16_sse2; pixf->sa8d [LUMA_64x32] = xavs2_pixel_sa8d_64x32_sse2; pixf->sa8d [LUMA_64x48] = xavs2_pixel_sa8d_64x48_sse2; @@ -1245,9 +2046,9 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) 
pixf->sa8d [LUMA_32x32] = xavs2_pixel_sa8d_32x32_sse2; pixf->sa8d [LUMA_64x64] = xavs2_pixel_sa8d_64x64_sse2; - INIT_SATD(sse2); +#if !HIGH_BIT_DEPTH pixf->sad_x3[LUMA_16x16] = xavs2_pixel_sad_x3_16x16_sse2; pixf->sad_x3[LUMA_16x8 ] = xavs2_pixel_sad_x3_16x8_sse2; pixf->sad_x3[LUMA_8x16 ] = xavs2_pixel_sad_x3_8x16_sse2; @@ -1261,9 +2062,10 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sad_x4[LUMA_8x4 ] = xavs2_pixel_sad_x4_8x4_sse2; INIT_SSD (sse2); - +#endif } +#if !HIGH_BIT_DEPTH if (cpuid & XAVS2_CPU_SSE3) { pixf->sad [LUMA_16x16] = xavs2_pixel_sad_16x16_sse3; pixf->sad [LUMA_16x8 ] = xavs2_pixel_sad_16x8_sse3; @@ -1292,12 +2094,13 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sad_x4[LUMA_16x16] = xavs2_pixel_sad_x4_16x16_sse3; pixf->sad_x4[LUMA_16x8 ] = xavs2_pixel_sad_x4_16x8_sse3; pixf->sad_x4[LUMA_16x4 ] = xavs2_pixel_sad_x4_16x4_sse3; - } +#endif if (cpuid & XAVS2_CPU_SSSE3) { INIT_SATD(ssse3); +#if !HIGH_BIT_DEPTH pixf->sad_x3[LUMA_64x64] = xavs2_pixel_sad_x3_64x64_ssse3; /* 64x64 */ pixf->sad_x3[LUMA_64x32] = xavs2_pixel_sad_x3_64x32_ssse3; pixf->sad_x3[LUMA_32x64] = xavs2_pixel_sad_x3_32x64_ssse3; @@ -1337,6 +2140,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sad_x4[LUMA_12x16] = xavs2_pixel_sad_x4_12x16_ssse3; INIT_SSD (ssse3); +#endif pixf->sa8d [LUMA_4x4 ] = xavs2_pixel_satd_4x4_ssse3; pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_ssse3; @@ -1345,11 +2149,11 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sa8d [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_ssse3; pixf->sa8d [LUMA_16x32] = xavs2_pixel_sa8d_16x32_ssse3; pixf->sa8d [LUMA_32x64] = xavs2_pixel_sa8d_32x64_ssse3; - } if (cpuid & XAVS2_CPU_SSE4) { INIT_SATD(sse4); +//#if !HIGH_BIT_DEPTH pixf->ssd [LUMA_12x16] = xavs2_pixel_ssd_12x16_sse4; pixf->ssd [LUMA_24x32] = xavs2_pixel_ssd_24x32_sse4; pixf->ssd [LUMA_48x64] = xavs2_pixel_ssd_48x64_sse4; @@ -1357,6 +2161,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* 
pixf) pixf->ssd [LUMA_64x32] = xavs2_pixel_ssd_64x32_sse4; pixf->ssd [LUMA_64x48] = xavs2_pixel_ssd_64x48_sse4; pixf->ssd [LUMA_64x64] = xavs2_pixel_ssd_64x64_sse4; +//#endif pixf->sa8d [LUMA_4x4 ] = xavs2_pixel_satd_4x4_sse4; pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_sse4; @@ -1365,11 +2170,11 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sa8d [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_sse4; pixf->sa8d [LUMA_16x32] = xavs2_pixel_sa8d_16x32_sse4; pixf->sa8d [LUMA_32x64] = xavs2_pixel_sa8d_32x64_sse4; - } if (cpuid & XAVS2_CPU_AVX) { INIT_SATD(avx); +#if !HIGH_BIT_DEPTH pixf->sad_x3[LUMA_64x64] = xavs2_pixel_sad_x3_64x64_avx; /* 64x64 */ pixf->sad_x3[LUMA_64x32] = xavs2_pixel_sad_x3_64x32_avx; pixf->sad_x3[LUMA_32x64] = xavs2_pixel_sad_x3_32x64_avx; @@ -1409,6 +2214,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sad_x4[LUMA_12x16] = xavs2_pixel_sad_x4_12x16_avx; INIT_SSD (avx); +#endif pixf->sa8d [LUMA_4x4 ] = xavs2_pixel_satd_4x4_avx; pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_avx; @@ -1418,16 +2224,18 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sa8d [LUMA_16x32] = xavs2_pixel_sa8d_16x32_avx; pixf->sa8d [LUMA_32x64] = xavs2_pixel_sa8d_32x64_avx; pixf->sa8d [LUMA_64x64] = xavs2_pixel_sa8d_64x64_avx; - } +#if defined(__XOP__) if (cpuid & XAVS2_CPU_XOP) { INIT_SATD(xop); +#if !HIGH_BIT_DEPTH pixf->ssd [LUMA_16x16] = xavs2_pixel_ssd_16x16_xop; pixf->ssd [LUMA_16x8 ] = xavs2_pixel_ssd_16x8_xop; pixf->ssd [LUMA_8x16 ] = xavs2_pixel_ssd_8x16_xop; pixf->ssd [LUMA_8x8 ] = xavs2_pixel_ssd_8x8_xop; pixf->ssd [LUMA_8x4 ] = xavs2_pixel_ssd_8x4_xop; +#endif //pixf->sa8d [LUMA_4x4 ] = xavs2_pixel_satd_4x4_xop; // in x265, this one is broken pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_xop; @@ -1436,11 +2244,13 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sa8d [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_xop; pixf->sa8d [LUMA_16x32] = xavs2_pixel_sa8d_16x32_xop; pixf->sa8d [LUMA_32x64] = 
xavs2_pixel_sa8d_32x64_xop; - } +#endif +#if defined(__AVX2__) #if ARCH_X86_64 if (cpuid & XAVS2_CPU_AVX2) { +#if !HIGH_BIT_DEPTH pixf->sad [LUMA_32x8 ] = xavs2_pixel_sad_32x8_avx2; pixf->sad [LUMA_32x16] = xavs2_pixel_sad_32x16_avx2; pixf->sad [LUMA_32x24] = xavs2_pixel_sad_32x24_avx2; @@ -1461,6 +2271,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->ssd [LUMA_32x32] = xavs2_pixel_ssd_32x32_avx2; pixf->ssd [LUMA_16x16] = xavs2_pixel_ssd_16x16_avx2; pixf->ssd [LUMA_16x8 ] = xavs2_pixel_ssd_16x8_avx2; +#endif pixf->satd [LUMA_16x16] = xavs2_pixel_satd_16x16_avx2; pixf->satd [LUMA_16x8 ] = xavs2_pixel_satd_16x8_avx2; @@ -1480,6 +2291,7 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->satd [LUMA_16x4 ] = xavs2_pixel_satd_16x4_avx2; pixf->satd [LUMA_16x12] = xavs2_pixel_satd_16x12_avx2; +#if !HIGH_BIT_DEPTH pixf->sad_x3[LUMA_32x8 ] = xavs2_pixel_sad_x3_32x8_avx2; pixf->sad_x3[LUMA_32x16] = xavs2_pixel_sad_x3_32x16_avx2; pixf->sad_x3[LUMA_32x24] = xavs2_pixel_sad_x3_32x24_avx2; @@ -1505,11 +2317,13 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) pixf->sad_x4[LUMA_64x32] = xavs2_pixel_sad_x4_64x32_avx2; pixf->sad_x4[LUMA_64x48] = xavs2_pixel_sad_x4_64x48_avx2; pixf->sad_x4[LUMA_64x64] = xavs2_pixel_sad_x4_64x64_avx2; +#endif pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_avx2; pixf->sa8d [LUMA_16x16] = xavs2_pixel_sa8d_16x16_avx2; pixf->sa8d [LUMA_32x32] = xavs2_pixel_sa8d_32x32_avx2; } +#endif #endif /* ------------------------------------------------------------- @@ -1534,12 +2348,14 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) INIT_PIXEL_AVG(16, 8, mmx2); INIT_PIXEL_AVG(16, 4, mmx2); INIT_PIXEL_AVG(16, 12, mmx2); +#if !HIGH_BIT_DEPTH INIT_PIXEL_AVG( 8, 32, mmx2); INIT_PIXEL_AVG( 8, 16, mmx2); - INIT_PIXEL_AVG( 4, 16, mmx2); - INIT_PIXEL_AVG(12, 16, mmx2); INIT_PIXEL_AVG( 8, 8, mmx2); INIT_PIXEL_AVG( 8, 4, mmx2); +#endif + INIT_PIXEL_AVG( 4, 16, mmx2); + INIT_PIXEL_AVG(12, 16, mmx2); INIT_PIXEL_AVG( 
4, 8, mmx2); INIT_PIXEL_AVG( 4, 4, mmx2); } @@ -1569,10 +2385,24 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) INIT_PIXEL_AVG( 8, 4, sse2); } +#if !HIGH_BIT_DEPTH if (cpuid & XAVS2_CPU_SSE3) { INIT_PIXEL_FUNC(avg, _ssse3); } +#endif + + /* block average */ + if (cpuid & XAVS2_CPU_SSE42) { + pixf->average = xavs2_pixel_average_sse128; + } +#if _MSC_VER + if (cpuid & XAVS2_CPU_AVX) { + pixf->average = xavs2_pixel_average_avx; + } +#endif + +#if defined(__AVX2__) if (cpuid & XAVS2_CPU_AVX2) { #if ARCH_X86_64 INIT_PIXEL_AVG(64, 64, avx2); @@ -1592,20 +2422,11 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) INIT_PIXEL_AVG(16, 4, avx2); INIT_PIXEL_AVG(16, 12, avx2); } - - /* block average */ - if (cpuid & XAVS2_CPU_SSE42) { - pixf->average = xavs2_pixel_average_sse128; - } -#if _MSC_VER - if (cpuid & XAVS2_CPU_AVX2) { - pixf->average = xavs2_pixel_average_avx; - } #endif #endif /* init functions of block operation : copy/add/sub */ - init_block_opreation_funcs(cpuid, pixf); + init_block_opreation_funcs(param, cpuid, pixf); #undef INIT_PIXEL_AVG @@ -1617,9 +2438,9 @@ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) /* --------------------------------------------------------------------------- */ -static int mad_NxN_c(pel_t *p_src, int i_src, int cu_size) +static int mad8_NxN_c(pel8_t *p_src, int i_src, int cu_size) { - pel_t *p_src_base = p_src; + pel8_t *p_src_base = p_src; int num_pix = cu_size * cu_size; int x, y; int sum = 0; @@ -1648,23 +2469,71 @@ static int mad_NxN_c(pel_t *p_src, int i_src, int cu_size) return mad; } +static int mad10_NxN_c(pel10_t *p_src, int i_src, int cu_size) +{ + pel10_t *p_src_base = p_src; + int num_pix = cu_size * cu_size; + int x, y; + int sum = 0; + int f_avg = 0; /* average of all pixels in current block */ + int mad = 0; + + /* cal average */ + for (y = 0; y < cu_size; ++y) { + for (x = 0; x < cu_size; ++x) { + sum += p_src[x]; + } + p_src += i_src; + } + f_avg = (sum + (num_pix >> 1)) / num_pix; 
+ + /* cal mad */ + p_src = p_src_base; + for (y = 0; y < cu_size; ++y) { + for (x = 0; x < cu_size; ++x) { + int f_pxl = p_src[x]; + mad += XAVS2_ABS(f_pxl - f_avg); + } + p_src += i_src; + } + + return mad; +} /* --------------------------------------------------------------------------- */ -void xavs2_mad_init(uint32_t cpuid, mad_funcs_t *madf) +void xavs2_mad8_init(uint32_t cpuid, mad_funcs8_t *madf8) { - madf[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_NxN_c; - madf[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_NxN_c; - madf[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_NxN_c; + madf8[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad8_NxN_c; + madf8[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad8_NxN_c; + madf8[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad8_NxN_c; /* init asm function handles */ #if HAVE_MMX /* functions defined in file intrinsic_mad.c */ if (cpuid & XAVS2_CPU_SSE2) { - madf[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_16x16_sse128; - madf[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_32x32_sse128; - madf[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_64x64_sse128; + madf8[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_16x16_sse128; + madf8[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_32x32_sse128; + madf8[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_64x64_sse128; } #endif //if HAVE_MMX } +void xavs2_mad10_init(uint32_t cpuid, mad_funcs10_t *madf10) +{ + madf10[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad10_NxN_c; + madf10[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad10_NxN_c; + madf10[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad10_NxN_c; + + /* init asm function handles */ +#if HAVE_MMX + /* functions defined in file intrinsic_mad.c */ + if (cpuid & XAVS2_CPU_SSE2) { + madf10[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_16x16_sse128; + madf10[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_32x32_sse128; + madf10[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_64x64_sse128; + } +#endif //if HAVE_MMX +} + + diff --git a/source/common/pixel.h b/source/common/pixel.h index 645b591..2630772 100644 --- 
a/source/common/pixel.h +++ b/source/common/pixel.h @@ -121,48 +121,79 @@ enum ChromaCU { }; -typedef cmp_dist_t(*pixel_cmp_t)(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2); -typedef dist_t(*pixel_ssd_t)(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2); -typedef dist_t(*pixel_ssd2_t)(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2, int width, int height); -typedef void(*pixel_cmp_x3_t)(const pel_t *fenc, const pel_t *pix0, const pel_t *pix1, const pel_t *pix2, intptr_t i_stride, int scores[3]); -typedef void(*pixel_cmp_x4_t)(const pel_t *fenc, const pel_t *pix0, const pel_t *pix1, const pel_t *pix2, const pel_t *pix3, intptr_t i_stride, int scores[4]); - -typedef void(*copy_pp_t)(pel_t* dst, intptr_t dstStride, const pel_t* src, intptr_t srcStride); // dst is aligned -typedef void(*copy_sp_t)(pel_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride); -typedef void(*copy_ps_t)(coeff_t* dst, intptr_t dstStride, const pel_t* src, intptr_t srcStride); -typedef void(*copy_ss_t)(coeff_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride); - -typedef void(*pixel_sub_ps_t)(coeff_t* dst, intptr_t dstride, const pel_t* src0, const pel_t* src1, intptr_t sstride0, intptr_t sstride1); -typedef void(*pixel_add_ps_t)(pel_t* a, intptr_t dstride, const pel_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1); -typedef void(*pixel_avg_pp_t)(pel_t* dst, intptr_t dstride, const pel_t* src0, intptr_t sstride0, const pel_t* src1, intptr_t sstride1, int weight); - -typedef int(*mad_funcs_t)(pel_t *p_src, int i_src, int cu_size); +typedef cmp_dist_t(*pixel8_cmp_t)(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2); +typedef cmp_dist_t(*pixel10_cmp_t)(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2); +typedef dist_t(*pixel8_ssd_t)(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2); +typedef 
dist_t(*pixel10_ssd_t)(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2); +typedef dist_t(*pixel8_ssd2_t)(const pel8_t *pix1, intptr_t i_pix1, const pel8_t *pix2, intptr_t i_pix2, int width, int height); +typedef dist_t(*pixel10_ssd2_t)(const pel10_t *pix1, intptr_t i_pix1, const pel10_t *pix2, intptr_t i_pix2, int width, int height); +typedef void(*pixel8_cmp_x3_t)(const pel8_t *fenc, const pel8_t *pix0, const pel8_t *pix1, const pel8_t *pix2, intptr_t i_stride, int scores[3]); +typedef void(*pixel10_cmp_x3_t)(const pel10_t *fenc, const pel10_t *pix0, const pel10_t *pix1, const pel10_t *pix2, intptr_t i_stride, int scores[3]); +typedef void(*pixel8_cmp_x4_t)(const pel8_t *fenc, const pel8_t *pix0, const pel8_t *pix1, const pel8_t *pix2, const pel8_t *pix3, intptr_t i_stride, int scores[4]); +typedef void(*pixel10_cmp_x4_t)(const pel10_t *fenc, const pel10_t *pix0, const pel10_t *pix1, const pel10_t *pix2, const pel10_t *pix3, intptr_t i_stride, int scores[4]); + +typedef void(*copy_pp8_t)(pel8_t* dst, intptr_t dstStride, const pel8_t* src, intptr_t srcStride); // dst is aligned +typedef void(*copy_pp10_t)(pel10_t* dst, intptr_t dstStride, const pel10_t* src, intptr_t srcStride); // dst is aligned +typedef void(*copy_sp8_t)(pel8_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride); +typedef void(*copy_sp10_t)(pel10_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride); +typedef void(*copy_ps8_t)(coeff_t* dst, intptr_t dstStride, const pel8_t* src, intptr_t srcStride); +typedef void(*copy_ps10_t)(coeff_t* dst, intptr_t dstStride, const pel10_t* src, intptr_t srcStride); +typedef void(*copy_ss8_t)(coeff_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride); +typedef void(*copy_ss10_t)(coeff_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride); + +typedef void(*pixel_sub_ps8_t)(coeff_t* dst, intptr_t dstride, const pel8_t* src0, const pel8_t* src1, intptr_t sstride0, intptr_t 
sstride1); +typedef void(*pixel_sub_ps10_t)(coeff_t* dst, intptr_t dstride, const pel10_t* src0, const pel10_t* src1, intptr_t sstride0, intptr_t sstride1); +typedef void(*pixel_add_ps8_t)(xavs2_t *h, pel8_t* a, intptr_t dstride, const pel8_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1); +typedef void(*pixel_add_ps10_t)(xavs2_t *h, pel10_t* a, intptr_t dstride, const pel10_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1); +typedef void(*pixel_avg_pp8_t)(pel8_t* dst, intptr_t dstride, const pel8_t* src0, intptr_t sstride0, const pel8_t* src1, intptr_t sstride1, int weight); +typedef void(*pixel_avg_pp10_t)(pel10_t* dst, intptr_t dstride, const pel10_t* src0, intptr_t sstride0, const pel10_t* src1, intptr_t sstride1, int weight); + +typedef int(*mad_funcs8_t)(pel8_t *p_src, int i_src, int cu_size); +typedef int(*mad_funcs10_t)(pel10_t *p_src, int i_src, int cu_size); typedef struct { - pixel_cmp_t sad [NUM_PU_SIZES]; - pixel_cmp_t satd [NUM_PU_SIZES]; - pixel_cmp_t sa8d [NUM_PU_SIZES]; - pixel_ssd_t ssd [NUM_PU_SIZES]; - pixel_cmp_x3_t sad_x3 [NUM_PU_SIZES]; - pixel_cmp_x4_t sad_x4 [NUM_PU_SIZES]; - - pixel_sub_ps_t sub_ps [NUM_PU_SIZES]; - pixel_add_ps_t add_ps [NUM_PU_SIZES]; - copy_sp_t copy_sp[NUM_PU_SIZES]; - copy_ps_t copy_ps[NUM_PU_SIZES]; - copy_ss_t copy_ss[NUM_PU_SIZES]; - copy_pp_t copy_pp[NUM_PU_SIZES]; - pixel_avg_pp_t avg [NUM_PU_SIZES]; - - pixel_cmp_t *intra_cmp; /* either satd or sad for intra mode prediction */ - pixel_cmp_t *fpel_cmp; /* either satd or sad for fractional pixel comparison in ME */ - - mad_funcs_t madf[CTU_DEPTH]; - - pixel_ssd2_t ssd_block; + pixel8_cmp_t sad8 [NUM_PU_SIZES]; + pixel10_cmp_t sad10 [NUM_PU_SIZES]; + pixel8_cmp_t satd8 [NUM_PU_SIZES]; + pixel10_cmp_t satd10 [NUM_PU_SIZES]; + pixel8_cmp_t sa8d8 [NUM_PU_SIZES]; + pixel10_cmp_t sa8d10 [NUM_PU_SIZES]; + pixel8_ssd_t ssd8 [NUM_PU_SIZES]; + pixel10_ssd_t ssd10 [NUM_PU_SIZES]; + pixel8_cmp_x3_t sad8_x3 [NUM_PU_SIZES]; + pixel10_cmp_x3_t 
sad10_x3 [NUM_PU_SIZES]; + pixel8_cmp_x4_t sad8_x4 [NUM_PU_SIZES]; + pixel10_cmp_x4_t sad10_x4 [NUM_PU_SIZES]; + + pixel_sub_ps8_t sub_ps8 [NUM_PU_SIZES]; + pixel_sub_ps10_t sub_ps10 [NUM_PU_SIZES]; + pixel_add_ps8_t add_ps8 [NUM_PU_SIZES]; + pixel_add_ps10_t add_ps10 [NUM_PU_SIZES]; + copy_sp8_t copy_sp8[NUM_PU_SIZES]; + copy_sp10_t copy_sp10[NUM_PU_SIZES]; + copy_ps8_t copy_ps8[NUM_PU_SIZES]; + copy_ps10_t copy_ps10[NUM_PU_SIZES]; + copy_ss8_t copy_ss8[NUM_PU_SIZES]; + copy_ss10_t copy_ss10[NUM_PU_SIZES]; + copy_pp8_t copy_pp8[NUM_PU_SIZES]; + copy_pp10_t copy_pp10[NUM_PU_SIZES]; + pixel_avg_pp8_t avg8 [NUM_PU_SIZES]; + pixel_avg_pp10_t avg10 [NUM_PU_SIZES]; + + pixel8_cmp_t *intra8_cmp; /* either satd or sad for intra mode prediction */ + pixel10_cmp_t *intra10_cmp; /* either satd or sad for intra mode prediction */ + pixel8_cmp_t *fpel8_cmp; /* either satd or sad for fractional pixel comparison in ME */ + pixel10_cmp_t *fpel10_cmp; /* either satd or sad for fractional pixel comparison in ME */ + + mad_funcs8_t madf8[CTU_DEPTH]; + mad_funcs10_t madf10[CTU_DEPTH]; + + pixel8_ssd2_t ssd_block8; + pixel10_ssd2_t ssd_block10; /* block average */ - void (*average)(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height); + void (*average8)(pel8_t *dst, int i_dst, pel8_t *src1, int i_src1, pel8_t *src2, int i_src2, int width, int height); + void (*average10)(pel10_t *dst, int i_dst, pel10_t *src1, int i_src1, pel10_t *src2, int i_src2, int width, int height); } pixel_funcs_t; @@ -185,17 +216,26 @@ extern const uint8_t g_partition_map_tab[]; */ #define xavs2_pixel_init FPFX(pixel_init) -void xavs2_pixel_init(uint32_t cpu, pixel_funcs_t* pixf); +void xavs2_pixel_init(xavs2_param_t* param, uint32_t cpu, pixel_funcs_t* pixf); -#define xavs2_pixel_ssd_wxh FPFX(xpixel_ssd_wxh) -uint64_t xavs2_pixel_ssd_wxh(pixel_funcs_t *pf, - pel_t *p_pix1, intptr_t i_pix1, - pel_t *p_pix2, intptr_t i_pix2, +#define xavs2_pixel_ssd8_wxh 
FPFX(xpixel_ssd8_wxh) +uint64_t xavs2_pixel_ssd8_wxh(pixel_funcs_t *pf, + pel8_t *p_pix1, intptr_t i_pix1, + pel8_t *p_pix2, intptr_t i_pix2, int i_width, int i_height, int inout_shift); +#define xavs2_pixel_ssd10_wxh FPFX(xpixel_ssd10_wxh) +uint64_t xavs2_pixel_ssd10_wxh(pixel_funcs_t *pf, + pel10_t *p_pix1, intptr_t i_pix1, + pel10_t *p_pix2, intptr_t i_pix2, + int i_width, int i_height, + int inout_shift); + +#define xavs2_mad8_init FPFX(mad8_init) +void xavs2_mad8_init(uint32_t cpu, mad_funcs8_t *madf8); -#define xavs2_mad_init FPFX(mad_init) -void xavs2_mad_init(uint32_t cpu, mad_funcs_t *madf); +#define xavs2_mad10_init FPFX(mad10_init) +void xavs2_mad10_init(uint32_t cpu, mad_funcs10_t *madf10); #endif // XAVS2_PIXEL_H diff --git a/source/common/primitives.c b/source/common/primitives.c index 07e4251..2656104 100644 --- a/source/common/primitives.c +++ b/source/common/primitives.c @@ -55,26 +55,29 @@ void xavs2_init_all_primitives(xavs2_param_t* param, intrinsic_func_t *p_funcs) uint32_t cpuid = p_funcs->cpuid; if (param != NULL) { - if (param->sample_bit_depth != g_bit_depth) { - xavs2_log(NULL, XAVS2_LOG_ERROR, "init primitives error: only %d bit-depth is supported\n", g_bit_depth); + if (param->sample_bit_depth != param->input_sample_bit_depth) { + xavs2_log(NULL, XAVS2_LOG_ERROR, "init primitives error: only %d bit-depth is supported\n", param->input_sample_bit_depth); } } /* init memory operation function handlers */ - xavs2_mem_oper_init (cpuid, p_funcs); + xavs2_mem_oper_init (param, cpuid, p_funcs); /* init function handles */ - xavs2_intra_pred_init(cpuid, p_funcs); - xavs2_mc_init (cpuid, p_funcs); - xavs2_pixel_init (cpuid, &p_funcs->pixf); - xavs2_deblock_init (cpuid, p_funcs); + xavs2_intra_pred_init(param, cpuid, p_funcs); + xavs2_mc_init (param, cpuid, p_funcs); + xavs2_pixel_init (param, cpuid, &p_funcs->pixf); + xavs2_deblock_init (param, cpuid, p_funcs); xavs2_dct_init (cpuid, &p_funcs->dctf); xavs2_quant_init (cpuid, &p_funcs->dctf); 
xavs2_cg_scan_init (cpuid, p_funcs); - xavs2_mad_init (cpuid, p_funcs->pixf.madf); - - xavs2_sao_init (cpuid, p_funcs); - xavs2_alf_init (cpuid, p_funcs); + if (param->input_sample_bit_depth == 8) { + xavs2_mad8_init (cpuid, p_funcs->pixf.madf8); + } else { + xavs2_mad10_init (cpuid, p_funcs->pixf.madf10); + } + xavs2_sao_init (param, cpuid, p_funcs); + xavs2_alf_init (param, cpuid, p_funcs); xavs2_rdo_init (cpuid, p_funcs); } diff --git a/source/common/primitives.h b/source/common/primitives.h index 7c690fe..0847cc1 100644 --- a/source/common/primitives.h +++ b/source/common/primitives.h @@ -57,39 +57,57 @@ typedef void *(*memcpy_t)(void *dst, const void *src, size_t n); /* --------------------------------------------------------------------------- * inter prediction */ -typedef void(*block_copy_t )(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h); -typedef void(*plane_copy_di_t)(pel_t *dstu, intptr_t i_dstu, pel_t *dstv, intptr_t i_dstv, pel_t *src, intptr_t i_src, int w, int h); -typedef void(*intpl_t )(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); -typedef void(*intpl_ext_t )(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_x, const int8_t *coeff_y); - -typedef void(*intpl_luma_hor_t)(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t *coeff); -typedef void(*intpl_luma_ext_t)(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff); -typedef void(*intpl_luma_ver_t)(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff); - -typedef void(*intpl_luma_ver_x3_t)(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff); -typedef void(*intpl_luma_hor_x3_t)(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff); -typedef 
void(*intpl_luma_ext_x3_t)(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff); - -typedef void (*filter_pp_t) (const pel_t *src, intptr_t srcStride, pel_t *dst, intptr_t dstStride, int coeffIdx); -typedef void (*filter_hps_t) (const pel_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt); -typedef void (*filter_ps_t) (const pel_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); -typedef void (*filter_sp_t) (const int16_t *src, intptr_t srcStride, pel_t *dst, intptr_t dstStride, int coeffIdx); +typedef void(*block_copy8_t )(xavs2_t *bb, pel8_t *dst, intptr_t i_dst, pel8_t *src, intptr_t i_src, int w, int h); +typedef void(*block_copy10_t )(xavs2_t *bb, pel10_t *dst, intptr_t i_dst, pel10_t *src, intptr_t i_src, int w, int h); +typedef void(*plane_copy8_di_t)(xavs2_t *bb, pel8_t *dstu, intptr_t i_dstu, pel8_t *dstv, intptr_t i_dstv, pel8_t *src, intptr_t i_src, int w, int h); +typedef void(*plane_copy10_di_t)(xavs2_t *bb, pel10_t *dstu, intptr_t i_dstu, pel10_t *dstv, intptr_t i_dstv, pel10_t *src, intptr_t i_src, int w, int h); +typedef void(*intpl8_t )(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, const int8_t *coeff); +typedef void(*intpl10_t )(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, const int8_t *coeff); +typedef void(*intpl8_ext_t )(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, const int8_t *coeff_x, const int8_t *coeff_y); +typedef void(*intpl10_ext_t )(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, const int8_t *coeff_x, const int8_t *coeff_y); + +typedef void(*intpl_luma8_hor_t)(xavs2_t *h, pel8_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel8_t *src, int i_src, int width, int height, const int8_t *coeff); +typedef void(*intpl_luma10_hor_t)(xavs2_t *h, pel10_t *dst, int i_dst, mct_t *tmp, int 
i_tmp, pel10_t *src, int i_src, int width, int height, const int8_t *coeff); +typedef void(*intpl_luma8_ext_t)(xavs2_t *h, pel8_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff); +typedef void(*intpl_luma10_ext_t)(xavs2_t *h, pel10_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff); +typedef void(*intpl_luma8_ver_t)(xavs2_t *h, pel8_t *dst, int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const *coeff); +typedef void(*intpl_luma10_ver_t)(xavs2_t *h, pel10_t *dst, int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const *coeff); + +typedef void(*intpl_luma8_ver_x3_t)(xavs2_t *h, pel8_t *const dst[3], int i_dst, pel8_t *src, int i_src, int width, int height, int8_t const **coeff); +typedef void(*intpl_luma10_ver_x3_t)(xavs2_t *h, pel10_t *const dst[3], int i_dst, pel10_t *src, int i_src, int width, int height, int8_t const **coeff); +typedef void(*intpl_luma8_hor_x3_t)(xavs2_t *h, pel8_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel8_t *src, int i_src, int width, int height, const int8_t **coeff); +typedef void(*intpl_luma10_hor_x3_t)(xavs2_t *h, pel10_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel10_t *src, int i_src, int width, int height, const int8_t **coeff); +typedef void(*intpl_luma8_ext_x3_t)(xavs2_t *h, pel8_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff); +typedef void(*intpl_luma10_ext_x3_t)(xavs2_t *h, pel10_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff); + +typedef void (*filter_pp8_t) (const pel8_t *src, intptr_t srcStride, pel8_t *dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_pp10_t) (const pel10_t *src, intptr_t srcStride, pel10_t *dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_hps8_t) (const pel8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt); 
+typedef void (*filter_hps10_t) (const pel10_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt); +typedef void (*filter_ps8_t) (const pel8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_ps10_t) (const pel10_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_sp8_t) (const int16_t *src, intptr_t srcStride, pel8_t *dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_sp10_t) (const int16_t *src, intptr_t srcStride, pel10_t *dst, intptr_t dstStride, int coeffIdx); typedef void (*filter_ss_t) (const int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); -typedef void (*filter_hv_pp_t) (const pel_t *src, intptr_t srcStride, pel_t *dst, intptr_t dstStride, int idxX, int idxY); -typedef void (*filter_p2s_t) (const pel_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride); +typedef void (*filter_hv_pp8_t) (const pel8_t *src, intptr_t srcStride, pel8_t *dst, intptr_t dstStride, int idxX, int idxY); +typedef void (*filter_hv_pp10_t) (const pel10_t *src, intptr_t srcStride, pel10_t *dst, intptr_t dstStride, int idxX, int idxY); +typedef void (*filter_p2s8_t) (const pel8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride); +typedef void (*filter_p2s10_t) (const pel10_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride); /* --------------------------------------------------------------------------- * intra prediction */ -typedef void(*intra_pred_t)(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); -typedef void(*fill_edge_t) (const pel_t *p_topleft, int i_topleft, const pel_t *p_lcu_ep, pel_t *ep, uint32_t i_avail, int bsx, int bsy); +typedef void(*intra8_pred_t)(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +typedef void(*intra10_pred_t)(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int bsx, int bsy); 
+typedef void(*fill_edge8_t) (xavs2_t *h, const pel8_t *p_topleft, int i_topleft, const pel8_t *p_lcu_ep, pel8_t *ep, uint32_t i_avail, int bsx, int bsy); +typedef void(*fill_edge10_t) (xavs2_t *h, const pel10_t *p_topleft, int i_topleft, const pel10_t *p_lcu_ep, pel10_t *ep, uint32_t i_avail, int bsx, int bsy); typedef void(*fill_ref_samples_t)(xavs2_t *h, cu_t *p_cu, int img_x, int img_y, int block_x, int block_y, int bsx, int bsy); /* --------------------------------------------------------------------------- * transform and quantization functions */ -typedef void(*dct_t)(const coeff_t *src, coeff_t *dst, int i_src); +typedef void(*dct_t)(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); /* --------------------------------------------------------------------------- * coefficient scan @@ -102,11 +120,11 @@ typedef struct { /* dct */ dct_t dct[NUM_PU_SIZES]; dct_t idct[NUM_PU_SIZES]; - dct_t dct_half[NUM_PU_SIZES]; // 只求解DCT矩阵的低频系数 + dct_t dct_half[NUM_PU_SIZES]; // 只求解DCT矩阵的低频系数 /* 2nd transform */ - void(*transform_4x4_2nd) (coeff_t *coeff, int i_coeff); - void(*inv_transform_4x4_2nd)(coeff_t *coeff, int i_coeff); + void(*transform_4x4_2nd) (xavs2_t *h, coeff_t *coeff, int i_coeff); + void(*inv_transform_4x4_2nd)(xavs2_t *h, coeff_t *coeff, int i_coeff); void(*transform_2nd) (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left); void(*inv_transform_2nd) (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left); @@ -120,11 +138,12 @@ typedef struct { /* SAO filter function */ -typedef void(*sao_flt_t)(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, +typedef void(*sao_flt8_t)(xavs2_t* h,pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src, + int i_block_w, int i_block_h, + int *lcu_avail, SAOBlkParam *sao_param); +typedef void(*sao_flt10_t)(xavs2_t* h,pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src, int i_block_w, int i_block_h, int *lcu_avail, SAOBlkParam *sao_param); - - /* 
--------------------------------------------------------------------------- */ @@ -137,7 +156,8 @@ typedef struct intrinsic_func_t { void*(*fast_memset)(void *dst, int val, size_t n); void (*mem_repeat_i)(void *dst, int val, size_t count); void*(*mem_repeat_p)(void *dst, int val, size_t count); - void (*lowres_filter)(pel_t *src, int i_src, pel_t *dst, int i_dst, int width, int height); + void (*lowres_filter8)(xavs2_t *h, pel8_t *src, int i_src, pel8_t *dst, int i_dst, int width, int height); + void (*lowres_filter10)(xavs2_t *h, pel10_t *src, int i_src, pel10_t *dst, int i_dst, int width, int height); pixel_funcs_t pixf; @@ -145,46 +165,73 @@ typedef struct intrinsic_func_t { * block copy */ /* align copy */ - block_copy_t align_copy; + block_copy8_t align_copy8; + block_copy10_t align_copy10; /* plane copy */ - block_copy_t plane_copy; - plane_copy_di_t plane_copy_deinterleave; + block_copy8_t plane_copy8; + block_copy10_t plane_copy10; + plane_copy8_di_t plane_copy8_deinterleave; + plane_copy10_di_t plane_copy10_deinterleave; /* --------------------------------------------------------------------------- * Motion Compensation */ - intpl_luma_hor_t intpl_luma_hor; - intpl_luma_ver_t intpl_luma_ver; - intpl_luma_ext_t intpl_luma_ext; - - intpl_luma_ver_x3_t intpl_luma_ver_x3; - intpl_luma_hor_x3_t intpl_luma_hor_x3; - intpl_luma_ext_x3_t intpl_luma_ext_x3; - - intpl_t intpl_luma_block_hor; - intpl_t intpl_luma_block_ver; - intpl_ext_t intpl_luma_block_ext; - - intpl_t intpl_chroma_block_hor; - intpl_t intpl_chroma_block_ver; - intpl_ext_t intpl_chroma_block_ext; - - struct inter_pred_t { - filter_pp_t luma_hpp; // 8-tap luma motion compensation interpolation filters - filter_hps_t luma_hps; - filter_pp_t luma_vpp; - filter_ps_t luma_vps; - filter_sp_t luma_vsp; - filter_ss_t luma_vss; - filter_hv_pp_t luma_hvpp; // combines hps + vsp - } intpl[NUM_PU_SIZES]; + intpl_luma8_hor_t intpl_luma8_hor; + intpl_luma10_hor_t intpl_luma10_hor; + intpl_luma8_ver_t 
intpl_luma8_ver; + intpl_luma10_ver_t intpl_luma10_ver; + intpl_luma8_ext_t intpl_luma8_ext; + intpl_luma10_ext_t intpl_luma10_ext; + + intpl_luma8_ver_x3_t intpl_luma8_ver_x3; + intpl_luma10_ver_x3_t intpl_luma10_ver_x3; + intpl_luma8_hor_x3_t intpl_luma8_hor_x3; + intpl_luma10_hor_x3_t intpl_luma10_hor_x3; + intpl_luma8_ext_x3_t intpl_luma8_ext_x3; + intpl_luma10_ext_x3_t intpl_luma10_ext_x3; + + intpl8_t intpl_luma8_block_hor; + intpl10_t intpl_luma10_block_hor; + intpl8_t intpl_luma8_block_ver; + intpl10_t intpl_luma10_block_ver; + intpl8_ext_t intpl_luma8_block_ext; + intpl10_ext_t intpl_luma10_block_ext; + + intpl8_t intpl_chroma8_block_hor; + intpl10_t intpl_chroma10_block_hor; + intpl8_t intpl_chroma8_block_ver; + intpl10_t intpl_chroma10_block_ver; + intpl8_ext_t intpl_chroma8_block_ext; + intpl10_ext_t intpl_chroma10_block_ext; + + struct inter_pred8_t { + filter_pp8_t luma_hpp8; // 8-tap luma motion compensation interpolation filters + filter_hps8_t luma_hps8; + filter_pp8_t luma_vpp8; + filter_ps8_t luma_vps8; + filter_sp8_t luma_vsp8; + filter_ss_t luma_vss8; + filter_hv_pp8_t luma_hvpp8; // combines hps + vsp + } intpl8[NUM_PU_SIZES]; + + struct inter_pred10_t { + filter_pp10_t luma_hpp10; // 8-tap luma motion compensation interpolation filters + filter_hps10_t luma_hps10; + filter_pp10_t luma_vpp10; + filter_ps10_t luma_vps10; + filter_sp10_t luma_vsp10; + filter_ss_t luma_vss10; + filter_hv_pp10_t luma_hvpp10; // combines hps + vsp + } intpl10[NUM_PU_SIZES]; /* --------------------------------------------------------------------------- * intra prediction */ - intra_pred_t intraf[NUM_INTRA_MODE]; - fill_edge_t fill_edge_f[4]; /* 0, x, y, xy */ + intra8_pred_t intraf8[NUM_INTRA_MODE]; + intra10_pred_t intraf10[NUM_INTRA_MODE]; + fill_edge8_t fill_edge8_f[4]; /* 0, x, y, xy */ + fill_edge10_t fill_edge10_f[4]; /* 0, x, y, xy */ fill_ref_samples_t fill_ref_luma[2]; /* 0: CU inside picture; 1: on right/bottom */ /* 
--------------------------------------------------------------------------- @@ -199,16 +246,25 @@ typedef struct intrinsic_func_t { /* --------------------------------------------------------------------------- * In-loop filter */ - void(*deblock_luma[2])(pel_t *, int, int, int, uint8_t*); - void(*deblock_chroma[2])(pel_t *, pel_t *, int, int, int, uint8_t*); + void(*deblock_luma8[2])(xavs2_t *, pel8_t *, int, int, int, uint8_t*); + void(*deblock_chroma8[2])(xavs2_t *, pel8_t *, pel8_t *, int, int, int, uint8_t*); + void(*deblock_luma10[2])(xavs2_t *, pel10_t *, int, int, int, uint8_t*); + void(*deblock_chroma10[2])(xavs2_t *, pel10_t *, pel10_t *, int, int, int, uint8_t*); - void(*deblock_luma_double[2]) (pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag); - void(*deblock_chroma_double[2])(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag); + void(*deblock_luma8_double[2]) (pel8_t *src, int stride, int alpha, int beta, uint8_t *flt_flag); + void(*deblock_chroma8_double[2])(pel8_t *src_u, pel8_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag); + void(*deblock_luma10_double[2]) (pel10_t *src, int stride, int alpha, int beta, uint8_t *flt_flag); + void(*deblock_chroma10_double[2])(pel10_t *src_u, pel10_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag); - sao_flt_t sao_block; /* filter for SAO */ + sao_flt8_t sao_block8; /* filter for SAO */ + sao_flt10_t sao_block10; /* filter for SAO */ /* function handles */ - void(*alf_flt[2])(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, + void(*alf_flt8[2])(xavs2_t *h, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src, + int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, + int *alf_coeff, int b_top_avail, int b_down_avail); + + void(*alf_flt10[2])(xavs2_t *h, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail); @@ -228,16 +284,17 @@ 
extern intrinsic_func_t g_funcs; * =========================================================================== */ #define xavs2_mem_oper_init FPFX(mem_oper_init) -void xavs2_mem_oper_init (uint32_t cpuid, intrinsic_func_t *pf); +void xavs2_mem_oper_init (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_mc_init FPFX(mc_init) -void xavs2_mc_init (uint32_t cpuid, intrinsic_func_t *pf); +void xavs2_mc_init (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_intra_pred_init FPFX(intra_pred_init) -void xavs2_intra_pred_init (uint32_t cpuid, intrinsic_func_t *pf); +void xavs2_intra_pred_init (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_dct_init FPFX(dct_init) void xavs2_dct_init (uint32_t cpuid, dct_funcs_t *dctf); + #define xavs2_quant_init FPFX(quant_init) void xavs2_quant_init (uint32_t cpuid, dct_funcs_t *quantf); @@ -245,12 +302,13 @@ void xavs2_quant_init (uint32_t cpuid, dct_funcs_t *quantf); void xavs2_cg_scan_init (uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_deblock_init FPFX(deblock_init) -void xavs2_deblock_init (uint32_t cpuid, intrinsic_func_t* lf); +void xavs2_deblock_init (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t* lf); #define xavs2_sao_init FPFX(sao_init) -void xavs2_sao_init (uint32_t cpuid, intrinsic_func_t *pf); +void xavs2_sao_init (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf); + #define xavs2_alf_init FPFX(alf_init) -void xavs2_alf_init (uint32_t cpuid, intrinsic_func_t *pf); +void xavs2_alf_init (xavs2_param_t* param, uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_rdo_init FPFX(rdo_init) void xavs2_rdo_init (uint32_t cpuid, intrinsic_func_t *pf); diff --git a/source/common/quant.c b/source/common/quant.c index f88f004..5eb3ae5 100644 --- a/source/common/quant.c +++ b/source/common/quant.c @@ -228,6 +228,7 @@ void xavs2_quant_init(uint32_t cpuid, dct_funcs_t *dctf) dctf->add_sign = add_sign_sse128; } +#if defined(__AVX2__) if 
(cpuid & XAVS2_CPU_AVX2) { dctf->quant = quant_c_avx2; dctf->dequant = dequant_c_avx2; @@ -241,6 +242,7 @@ void xavs2_quant_init(uint32_t cpuid, dct_funcs_t *dctf) dctf->dequant = FPFX(dequant_avx2); #endif } +#endif #else UNUSED_PARAMETER(cpuid); #endif // if HAVE_MMX diff --git a/source/common/threadpool.c b/source/common/threadpool.c index 34bd8f7..9fe742d 100644 --- a/source/common/threadpool.c +++ b/source/common/threadpool.c @@ -300,7 +300,7 @@ int xavs2_threadpool_init(xavs2_threadpool_t **p_pool, int threads, xavs2_tfunc_ if (xavs2_sync_job_list_init(&pool->uninit, pool->i_threads) || xavs2_sync_job_list_init(&pool->run, pool->i_threads) || xavs2_sync_job_list_init(&pool->done, pool->i_threads)) { - goto fail; + goto fail8; } for (i = 0; i < pool->i_threads; i++) { @@ -313,13 +313,13 @@ int xavs2_threadpool_init(xavs2_threadpool_t **p_pool, int threads, xavs2_tfunc_ for (i = 0; i < pool->i_threads; i++) { if (xavs2_create_thread(pool->thread_handle + i, (xavs2_tfunc_t)proc_xavs2_threadpool_thread, pool)) { - goto fail; + goto fail8; } } return 0; -fail: +fail8: return -1; } diff --git a/source/common/transform.c b/source/common/transform.c index 250545d..f282115 100644 --- a/source/common/transform.c +++ b/source/common/transform.c @@ -1061,9 +1061,9 @@ static void xTr2nd_4_1d_Inv_Hor(coeff_t *coeff, int i_coeff, int i_shift, int cl /* --------------------------------------------------------------------------- */ -static void transform_4x4_2nd_c(coeff_t *coeff, int i_coeff) +static void transform_4x4_2nd_c(xavs2_t *h, coeff_t *coeff, int i_coeff) { - const int shift1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + 1; + const int shift1 = B4X4_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT + 1; const int shift2 = B4X4_IN_BIT + FACTO_BIT + 1; xTr2nd_4_1d_Hor(coeff, i_coeff, shift1, g_2T_C); @@ -1072,11 +1072,11 @@ static void transform_4x4_2nd_c(coeff_t *coeff, int i_coeff) /* 
--------------------------------------------------------------------------- */ -static void inv_transform_4x4_2nd_c(coeff_t *coeff, int i_coeff) +static void inv_transform_4x4_2nd_c(xavs2_t *h, coeff_t *coeff, int i_coeff) { const int shift1 = 5; - const int shift2 = 20 - g_bit_depth + 2; - const int clip_depth2 = g_bit_depth + 1; + const int shift2 = 20 - h->param->input_sample_bit_depth + 2; + const int clip_depth2 = h->param->input_sample_bit_depth + 1; xTr2nd_4_1d_Inv_Ver(coeff, i_coeff, shift1, g_2T_C); xTr2nd_4_1d_Inv_Hor(coeff, i_coeff, shift2, clip_depth2, g_2T_C); @@ -1120,12 +1120,12 @@ static void inv_transform_2nd_c(coeff_t *coeff, int i_coeff, int i_mode, int b_t /* --------------------------------------------------------------------------- */ -static void dct_4x4_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_4x4_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE 4 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); - int shift1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; // 0 + int shift1 = B4X4_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT; // 0 int shift2 = B4X4_IN_BIT + FACTO_BIT; // 7 int i; @@ -1142,15 +1142,15 @@ static void dct_4x4_c(const coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- */ -static void idct_4x4_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_4x4_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 4 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = 5; - int shift2 = 20 - g_bit_depth; + int shift2 = 20 - h->param->input_sample_bit_depth; int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1; + int clip_depth2 = h->param->input_sample_bit_depth + 1; int i; partialButterflyInverse4(src, coeff, shift1, BSIZE, clip_depth1); @@ -1164,12 +1164,12 @@ static void 
idct_4x4_c(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -static void dct_8x8_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_8x8_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE 8 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); - int shift1 = B8X8_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; + int shift1 = B8X8_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT; int shift2 = B8X8_IN_BIT + FACTO_BIT; int i; @@ -1184,15 +1184,15 @@ static void dct_8x8_c(const coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- */ -static void idct_8x8_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_8x8_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 8 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = 5; - int shift2 = 20 - g_bit_depth; + int shift2 = 20 - h->param->input_sample_bit_depth; int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1; + int clip_depth2 = h->param->input_sample_bit_depth + 1; int i; partialButterflyInverse8(src, coeff, shift1, BSIZE, clip_depth1); @@ -1206,12 +1206,12 @@ static void idct_8x8_c(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -static void dct_16x16_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_16x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE 16 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); - int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; + int shift1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT; int shift2 = B16X16_IN_BIT + FACTO_BIT; int i; @@ -1226,15 +1226,15 @@ static void dct_16x16_c(const 
coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- */ -static void idct_16x16_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_16x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 16 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = 5; - int shift2 = 20 - g_bit_depth; + int shift2 = 20 - h->param->input_sample_bit_depth; int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1; + int clip_depth2 = h->param->input_sample_bit_depth + 1; int i; partialButterflyInverse16(src, coeff, shift1, BSIZE, clip_depth1); @@ -1250,12 +1250,12 @@ static void idct_16x16_c(const coeff_t *src, coeff_t *dst, int i_dst) * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ -static void dct_32x32_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_32x32_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE 32 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); - int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); + int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); int shift2 = B32X32_IN_BIT + FACTO_BIT; int i; @@ -1273,10 +1273,10 @@ static void dct_32x32_c(const coeff_t *src, coeff_t *dst, int i_src) * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ -static void dct_32x32_half_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_32x32_half_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { int i; - dct_32x32_c(src, dst, i_src); + dct_32x32_c(h, src, dst, i_src); for (i = 0; i < 16; i++) { memset(dst + 16, 0, 16 * sizeof(coeff_t)); @@ -1289,16 +1289,16 @@ static void dct_32x32_half_c(const coeff_t *src, coeff_t *dst, int i_src) * NOTE: * i_dst - the stride of dst (the lowest bit is additional 
wavelet flag) */ -static void idct_32x32_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_32x32_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 32 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int a_flag = i_dst & 0x01; int shift1 = 5; - int shift2 = 20 - g_bit_depth - a_flag; + int shift2 = 20 - h->param->input_sample_bit_depth - a_flag; int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1 + a_flag; + int clip_depth2 = h->param->input_sample_bit_depth + 1 + a_flag; int i; i_dst &= 0xFE; /* remember to remove the flag bit */ @@ -1313,13 +1313,13 @@ static void idct_32x32_c(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -static void dct_16x4_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_16x4_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE_H 16 #define BSIZE_V 4 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); - int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; + int shift1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT; int shift2 = B16X16_IN_BIT + FACTO_BIT - 2; int i; @@ -1335,16 +1335,16 @@ static void dct_16x4_c(const coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- */ -static void idct_16x4_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_16x4_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 16 #define BSIZE_V 4 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; - int shift2 = 20 - g_bit_depth; + int shift2 = 20 - h->param->input_sample_bit_depth; int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1; + int clip_depth2 = h->param->input_sample_bit_depth + 1; int i; partialButterflyInverse4 
(src, coeff, shift1, BSIZE_H, clip_depth1); @@ -1359,13 +1359,13 @@ static void idct_16x4_c(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -static void dct_4x16_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_4x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE_H 4 #define BSIZE_V 16 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); - int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2; + int shift1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT - 2; int shift2 = B16X16_IN_BIT + FACTO_BIT; int i; @@ -1381,16 +1381,16 @@ static void dct_4x16_c(const coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- */ -static void idct_4x16_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_4x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 4 #define BSIZE_V 16 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; - int shift2 = 20 - g_bit_depth; + int shift2 = 20 - h->param->input_sample_bit_depth; int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1; + int clip_depth2 = h->param->input_sample_bit_depth + 1; int i; partialButterflyInverse16(src, coeff, shift1, BSIZE_H, clip_depth1); @@ -1407,13 +1407,13 @@ static void idct_4x16_c(const coeff_t *src, coeff_t *dst, int i_dst) * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ -static void dct_32x8_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_32x8_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE_H 32 #define BSIZE_V 8 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); - int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; + int 
shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT; int shift2 = B32X32_IN_BIT + FACTO_BIT - 2 - (i_src & 0x01); int i; @@ -1432,16 +1432,16 @@ static void dct_32x8_c(const coeff_t *src, coeff_t *dst, int i_src) * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ -static void idct_32x8_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_32x8_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 32 #define BSIZE_V 8 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; - int shift2 = 20 - g_bit_depth - (i_dst & 0x01); + int shift2 = 20 - h->param->input_sample_bit_depth - (i_dst & 0x01); int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); + int clip_depth2 = h->param->input_sample_bit_depth + 1 + (i_dst & 0x01); int i; partialButterflyInverse8 (src, coeff, shift1, BSIZE_H, clip_depth1); @@ -1459,13 +1459,13 @@ static void idct_32x8_c(const coeff_t *src, coeff_t *dst, int i_dst) * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ -static void dct_8x32_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_8x32_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE_H 8 #define BSIZE_V 32 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); - int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01); + int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01); int shift2 = B32X32_IN_BIT + FACTO_BIT; int i; @@ -1484,16 +1484,16 @@ static void dct_8x32_c(const coeff_t *src, coeff_t *dst, int i_src) * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ -static void idct_8x32_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_8x32_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, 
int i_dst) { #define BSIZE_H 8 #define BSIZE_V 32 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; - int shift2 = 20 - g_bit_depth - (i_dst & 0x01); + int shift2 = 20 - h->param->input_sample_bit_depth - (i_dst & 0x01); int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); + int clip_depth2 = h->param->input_sample_bit_depth + 1 + (i_dst & 0x01); int i; partialButterflyInverse32(src, coeff, shift1, BSIZE_H, clip_depth1); @@ -1511,22 +1511,22 @@ static void idct_8x32_c(const coeff_t *src, coeff_t *dst, int i_dst) * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ -static void dct_64x64_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_64x64_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(i_src); wavelet_64x64_c(src, dst); - dct_32x32_c(dst, dst, 32 | 0x01); /* 32x32 dct */ + dct_32x32_c(h, dst, dst, 32 | 0x01); /* 32x32 dct */ } /* --------------------------------------------------------------------------- * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ -static void dct_64x64_half_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_64x64_half_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(i_src); wavelet_64x64_c(src, dst); - dct_32x32_half_c(dst, dst, 32 | 0x01); /* 32x32 dct */ + dct_32x32_half_c(h, dst, dst, 32 | 0x01); /* 32x32 dct */ } @@ -1534,10 +1534,10 @@ static void dct_64x64_half_c(const coeff_t *src, coeff_t *dst, int i_src) * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ -static void idct_64x64_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_64x64_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); - idct_32x32_c(src, dst, 32 | 0x01); /* 32x32 idct */ + idct_32x32_c(h, src, dst, 32 | 0x01); /* 32x32 idct */ inv_wavelet_64x64_c(dst); } @@ 
-1545,21 +1545,21 @@ static void idct_64x64_c(const coeff_t *src, coeff_t *dst, int i_dst) * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ -static void dct_64x16_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_64x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(i_src); wavelet_64x16_c(src, dst); - dct_32x8_c(dst, dst, 32 | 0x01); + dct_32x8_c(h, dst, dst, 32 | 0x01); } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ -static void idct_64x16_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_64x16_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); - idct_32x8_c(src, dst, 32 | 0x01); + idct_32x8_c(h, src, dst, 32 | 0x01); inv_wavelet_64x16_c(dst); } @@ -1568,21 +1568,21 @@ static void idct_64x16_c(const coeff_t *src, coeff_t *dst, int i_dst) * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ -static void dct_16x64_c(const coeff_t *src, coeff_t *dst, int i_src) +static void dct_16x64_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(i_src); wavelet_16x64_c(src, dst); - dct_8x32_c(dst, dst, 8 | 0x01); + dct_8x32_c(h, dst, dst, 8 | 0x01); } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ -static void idct_16x64_c(const coeff_t *src, coeff_t *dst, int i_dst) +static void idct_16x64_c(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); - idct_8x32_c(src, dst, 8 | 0x01); + idct_8x32_c(h, src, dst, 8 | 0x01); inv_wavelet_16x64_c(dst); } @@ -1658,7 +1658,7 @@ void xavs2_dct_init(uint32_t cpuid, dct_funcs_t *dctf) /* dct: asymmetrical */ dctf->dct[LUMA_16x4 ] = dct_c_16x4_sse128; - dctf->dct[LUMA_4x16 ] = 
dct_c_4x16_sse128;//第一次变换没写移位 + dctf->dct[LUMA_4x16 ] = dct_c_4x16_sse128;//第一次变换没写移位 dctf->dct[LUMA_32x8 ] = dct_c_32x8_sse128; dctf->dct[LUMA_8x32 ] = dct_c_8x32_sse128; dctf->dct[LUMA_64x16] = dct_c_64x16_sse128; @@ -1708,8 +1708,8 @@ void xavs2_dct_init(uint32_t cpuid, dct_funcs_t *dctf) dctf->dct[LUMA_8x8 ] = xavs2_dct_8x8_sse4; } +#if defined(__AVX2__) if (cpuid & XAVS2_CPU_AVX2) { - dctf->dct [LUMA_4x4 ] = xavs2_dct_4x4_avx2; #if ARCH_X86_64 dctf->dct [LUMA_8x8 ] = xavs2_dct_8x8_avx2; @@ -1723,13 +1723,12 @@ void xavs2_dct_init(uint32_t cpuid, dct_funcs_t *dctf) #endif } - #if ARCH_X86_64 if (cpuid & XAVS2_CPU_AVX2) { - // dctf->dct[LUMA_4x4 ] = dct_c_4x4_avx2; /* futl: dct_4x4_avx2的速度比dct_4x4_sse128略慢一点 */ - // dctf->dct[LUMA_8x8 ] = dct_c_8x8_avx2; /* futl: dct_8x8_avx2的速度比xavs2_dct_8x8_avx2慢 */ - // dctf->dct[LUMA_4x16] = dct_c_4x16_avx2; /* futl: dct_4x16_avx2的速度比dct_4x16_sse128慢 */ - dctf->dct[LUMA_16x4 ] = dct_c_16x4_avx2; /* 姜波:速度比sse128快两倍 */ + // dctf->dct[LUMA_4x4 ] = dct_c_4x4_avx2; /* futl: dct_4x4_avx2的速度比dct_4x4_sse128略慢一点 */ + // dctf->dct[LUMA_8x8 ] = dct_c_8x8_avx2; /* futl: dct_8x8_avx2的速度比xavs2_dct_8x8_avx2慢 */ + // dctf->dct[LUMA_4x16] = dct_c_4x16_avx2; /* futl: dct_4x16_avx2的速度比dct_4x16_sse128慢 */ + dctf->dct[LUMA_16x4 ] = dct_c_16x4_avx2; /* 姜波:速度比sse128快两倍 */ dctf->dct[LUMA_8x32 ] = dct_c_8x32_avx2; dctf->dct[LUMA_32x8 ] = dct_c_32x8_avx2; dctf->dct[LUMA_16x16] = dct_c_16x16_avx2; @@ -1751,6 +1750,7 @@ void xavs2_dct_init(uint32_t cpuid, dct_funcs_t *dctf) dctf->dct_half[LUMA_64x64] = dct_c_64x64_half_avx2; } #endif // ARCH_X86_64 +#endif #else UNUSED_PARAMETER(cpuid); #endif // if HAVE_MMX diff --git a/source/common/vec/intrinsic.h b/source/common/vec/intrinsic.h index 27a68ac..ec0a1bd 100644 --- a/source/common/vec/intrinsic.h +++ b/source/common/vec/intrinsic.h @@ -47,7 +47,7 @@ #define M128_I16(mx, idx) _mm_extract_epi16(mx, idx) -#if _MSC_VER // 解决vs下immintrin.h中没有定义这些函数的问题 +#if _MSC_VER // 
解决vs下immintrin.h中没有定义这些函数的问题 #define _mm256_extract_epi64(a, i) (a.m256i_i64[i]) #define _mm256_extract_epi32(a, i) (a.m256i_i32[i]) #define _mm256_extract_epi16(a, i) (a.m256i_i16[i]) @@ -61,7 +61,7 @@ #define _mm256_insert_epi16(a, value, index) (a.m256i_i16[index] = value) #define _mm256_insert_epi8 (a, value, index) (a.m256i_i8 [index] = value) #else -// 添加部分gcc中缺少的avx函数定义 +// 添加部分gcc中缺少的avx函数定义 #define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1) #define _mm256_loadu2_m128i(/* __m128i const* */ hiaddr, \ @@ -98,40 +98,43 @@ ALIGN16(extern const int8_t tab_coeff_mode_11[64][16]); #define intpl_copy_block_sse128 FPFX(intpl_copy_block_sse128) void intpl_copy_block_sse128 (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height); #define intpl_luma_block_hor_sse128 FPFX(intpl_luma_block_hor_sse128) -void intpl_luma_block_hor_sse128 (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); +void intpl_luma_block_hor_sse128 (xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver_sse128 FPFX(intpl_luma_block_ver_sse128) -void intpl_luma_block_ver_sse128 (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); +void intpl_luma_block_ver_sse128 (xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ext_sse128 FPFX(intpl_luma_block_ext_sse128) -void intpl_luma_block_ext_sse128 (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); +void intpl_luma_block_ext_sse128 (xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); #define intpl_luma_hor_sse128 FPFX(intpl_luma_hor_sse128) -void intpl_luma_hor_sse128(pel_t *dst, int i_dst, 
mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t *coeff); +void intpl_luma_hor_sse128(xavs2_t *h, pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_ver_sse128 FPFX(intpl_luma_ver_sse128) -void intpl_luma_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); +void intpl_luma_ver_sse128(xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_ext_sse128 FPFX(intpl_luma_ext_sse128) -void intpl_luma_ext_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff); +void intpl_luma_ext_sse128(xavs2_t *h, pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff); +#if defined(__AVX2__) #define intpl_luma_hor_avx2 FPFX(intpl_luma_hor_avx2) void intpl_luma_hor_avx2(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff); #define intpl_luma_ver_avx2 FPFX(intpl_luma_ver_avx2) void intpl_luma_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff); #define intpl_luma_ext_avx2 FPFX(intpl_luma_ext_avx2) void intpl_luma_ext_avx2(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff); +#endif #define intpl_luma_hor_x3_sse128 FPFX(intpl_luma_hor_x3_sse128) -void intpl_luma_hor_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff); +void intpl_luma_hor_x3_sse128(xavs2_t *h, pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff); #define intpl_luma_ver_x3_sse128 FPFX(intpl_luma_ver_x3_sse128) -void intpl_luma_ver_x3_sse128(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int 
height, const int8_t **coeff); +void intpl_luma_ver_x3_sse128(xavs2_t *h, pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, const int8_t **coeff); #define intpl_luma_ext_x3_sse128 FPFX(intpl_luma_ext_x3_sse128) -void intpl_luma_ext_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff); +void intpl_luma_ext_x3_sse128(xavs2_t *h, pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff); #define intpl_chroma_block_hor_sse128 FPFX(intpl_chroma_block_hor_sse128) -void intpl_chroma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); +void intpl_chroma_block_hor_sse128(xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_chroma_block_ver_sse128 FPFX(intpl_chroma_block_ver_sse128) -void intpl_chroma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); +void intpl_chroma_block_ver_sse128(xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_chroma_block_ext_sse128 FPFX(intpl_chroma_block_ext_sse128) -void intpl_chroma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); +void intpl_chroma_block_ext_sse128(xavs2_t *h, pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); +#if defined(__AVX2__) #define intpl_luma_block_hor_avx2 FPFX(intpl_luma_block_hor_avx2) void intpl_luma_block_hor_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver_avx2 FPFX(intpl_luma_block_ver_avx2) @@ -152,6 +155,7 @@ void intpl_luma_hor_x3_avx2(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], void intpl_luma_ver_x3_avx2(pel_t 
*const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, const int8_t **coeff); #define intpl_luma_ext_x3_avx2 FPFX(intpl_luma_ext_x3_avx2) void intpl_luma_ext_x3_avx2(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff); +#endif /* memory operation */ #define cpy_pel_I420_to_uchar_YUY2_sse128 FPFX(cpy_pel_I420_to_uchar_YUY2_sse128) @@ -160,12 +164,13 @@ void cpy_pel_I420_to_uchar_YUY2_sse128(const pel_t *srcy, const pel_t *srcu, con void add_pel_clip_sse128(const pel_t *src1, int i_src1, const int16_t *src2, int i_src2, pel_t *dst, int i_dst, int width, int height, int bit_depth); #define xavs2_pixel_average_sse128 FPFX(pixel_average_sse128) void xavs2_pixel_average_sse128(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height); -#define xavs2_pixel_average_avx FPFX(pixel_average_avx) -void xavs2_pixel_average_avx (pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height); #define padding_rows_sse128 FPFX(padding_rows_sse128) void padding_rows_sse128 (pel_t *src, int i_src, int width, int height, int start, int rows, int pad); #define padding_rows_lr_sse128 FPFX(padding_rows_lr_sse128) void padding_rows_lr_sse128(pel_t *src, int i_src, int width, int height, int start, int rows, int pad); +#if defined(__AVX2__) +#define xavs2_pixel_average_avx FPFX(pixel_average_avx) +void xavs2_pixel_average_avx (pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height); #define padding_rows_sse256 FPFX(padding_rows_sse256) void padding_rows_sse256(pel_t *src, int i_src, int width, int height, int start, int rows, int pad); #define padding_rows_sse256_10bit FPFX(padding_rows_sse256_10bit) @@ -174,15 +179,15 @@ void padding_rows_sse256_10bit(pel_t *src, int i_src, int width, int height, int void padding_rows_lr_sse256(pel_t *src, int i_src, int width, int height, int start, int rows, int pad); #define 
padding_rows_lr_sse256_10bit FPFX(padding_rows_lr_sse256) void padding_rows_lr_sse256_10bit(pel_t *src, int i_src, int width, int height, int start, int rows, int pad); - -#define xavs2_memzero_aligned_c_sse2 FPFX(memzero_aligned_c_sse2) -void *xavs2_memzero_aligned_c_sse2(void *dst, size_t n); #define xavs2_memzero_aligned_c_avx FPFX(memzero_aligned_c_avx) void *xavs2_memzero_aligned_c_avx (void *dst, size_t n); -#define xavs2_mem_repeat_i_c_sse2 FPFX(mem_repeat_i_c_sse2) -void xavs2_mem_repeat_i_c_sse2 (void *dst, int val, size_t count); #define xavs2_mem_repeat_i_c_avx FPFX(mem_repeat_i_c_avx) void xavs2_mem_repeat_i_c_avx (void *dst, int val, size_t count); +#endif +#define xavs2_memzero_aligned_c_sse2 FPFX(memzero_aligned_c_sse2) +void *xavs2_memzero_aligned_c_sse2(void *dst, size_t n); +#define xavs2_mem_repeat_i_c_sse2 FPFX(mem_repeat_i_c_sse2) +void xavs2_mem_repeat_i_c_sse2 (void *dst, int val, size_t count); #define xavs2_memcpy_aligned_c_sse2 FPFX(memcpy_aligned_c_sse2) void *xavs2_memcpy_aligned_c_sse2 (void *dst, const void *src, size_t n); @@ -196,6 +201,7 @@ void deblock_edge_ver_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int A void deblock_edge_hor_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, unsigned char *flt_flag); //--------avx2-------- add by zhangjiaqi 2016-12-02 +#if defined(__AVX2__) #define deblock_edge_hor_avx2 FPFX(deblock_edge_hor_avx2) void deblock_edge_hor_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define deblock_edge_ver_avx2 FPFX(deblock_edge_ver_avx2) @@ -204,32 +210,34 @@ void deblock_edge_ver_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8 void deblock_edge_hor_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define deblock_edge_ver_c_avx2 FPFX(deblock_edge_ver_c_avx2) void deblock_edge_ver_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag); +#endif #define 
dct_c_4x4_sse128 FPFX(dct_c_4x4_sse128) -void dct_c_4x4_sse128 (const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_4x4_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_8x8_sse128 FPFX(dct_c_8x8_sse128) -void dct_c_8x8_sse128 (const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_8x8_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_16x16_sse128 FPFX(dct_c_16x16_sse128) -void dct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_16x16_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_32x32_sse128 FPFX(dct_c_32x32_sse128) -void dct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_32x32_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_64x64_sse128 FPFX(dct_c_64x64_sse128) -void dct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_64x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_4x16_sse128 FPFX(dct_c_4x16_sse128) -void dct_c_4x16_sse128 (const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_4x16_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_8x32_sse128 FPFX(dct_c_8x32_sse128) -void dct_c_8x32_sse128 (const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_8x32_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_16x4_sse128 FPFX(dct_c_16x4_sse128) -void dct_c_16x4_sse128 (const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_16x4_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_32x8_sse128 FPFX(dct_c_32x8_sse128) -void dct_c_32x8_sse128 (const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_32x8_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_64x16_sse128 FPFX(dct_c_64x16_sse128) -void dct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_64x16_sse128(xavs2_t *h, const coeff_t 
*src, coeff_t *dst, int i_src); #define dct_c_16x64_sse128 FPFX(dct_c_16x64_sse128) -void dct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_16x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); //futl +#if defined(__AVX2__) #define dct_c_4x4_avx2 FPFX(dct_c_4x4_avx2) void dct_c_4x4_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_8x8_avx2 FPFX(dct_c_8x8_avx2) @@ -255,54 +263,57 @@ void dct_c_64x64_avx2(const coeff_t *src, coeff_t *dst, int i_src); void dct_c_64x16_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_16x64_avx2 FPFX(dct_c_16x64_avx2) void dct_c_16x64_avx2(const coeff_t *src, coeff_t *dst, int i_src); +#endif /* half DCT, only keep low frequency coefficients */ #define dct_c_32x32_half_sse128 FPFX(dct_c_32x32_half_sse128) -void dct_c_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_32x32_half_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_64x64_half_sse128 FPFX(dct_c_64x64_half_sse128) -void dct_c_64x64_half_sse128(const coeff_t *src, coeff_t *dst, int i_src); +void dct_c_64x64_half_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); +#if defined(__AVX2__) #define dct_c_32x32_half_avx2 FPFX(dct_c_32x32_half_avx2) void dct_c_32x32_half_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_64x64_half_avx2 FPFX(dct_c_64x64_half_avx2) void dct_c_64x64_half_avx2(const coeff_t *src, coeff_t *dst, int i_src); +#endif #define transform_4x4_2nd_sse128 FPFX(transform_4x4_2nd_sse128) -void transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff); +void transform_4x4_2nd_sse128(xavs2_t *h, coeff_t *coeff, int i_coeff); #define transform_2nd_sse128 FPFX(transform_2nd_sse128) void transform_2nd_sse128 (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left); #define idct_c_4x4_sse128 FPFX(idct_c_4x4_sse128) -void idct_c_4x4_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_4x4_sse128 
(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_8x8_sse128 FPFX(idct_c_8x8_sse128) -void idct_c_8x8_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_8x8_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_16x16_sse128 FPFX(idct_c_16x16_sse128) -void idct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_16x16_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_32x32_sse128 FPFX(idct_c_32x32_sse128) -void idct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_32x32_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_64x64_sse128 FPFX(idct_c_64x64_sse128) -void idct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_64x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_16x4_sse128 FPFX(idct_c_16x4_sse128) -void idct_c_16x4_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_16x4_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_32x8_sse128 FPFX(idct_c_32x8_sse128) -void idct_c_32x8_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_32x8_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_64x16_sse128 FPFX(idct_c_64x16_sse128) -void idct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_64x16_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_4x16_sse128 FPFX(idct_c_4x16_sse128) -void idct_c_4x16_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_4x16_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_8x32_sse128 FPFX(idct_c_8x32_sse128) -void idct_c_8x32_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_8x32_sse128 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_16x64_sse128 FPFX(idct_c_16x64_sse128) -void 
idct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst); +void idct_c_16x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define inv_transform_4x4_2nd_sse128 FPFX(inv_transform_4x4_2nd_sse128) -void inv_transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff); +void inv_transform_4x4_2nd_sse128(xavs2_t *h, coeff_t *coeff, int i_coeff); #define inv_transform_2nd_sse128 FPFX(inv_transform_2nd_sse128) void inv_transform_2nd_sse128 (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left); - //zhangjiaqi add 2016.11.30 avx2 +#if defined(__AVX2__) #define idct_c_8x8_avx2 FPFX(idct_c_8x8_avx2) void idct_c_8x8_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_16x16_avx2 FPFX(idct_c_16x16_avx2) @@ -315,6 +326,7 @@ void idct_c_64x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst); void idct_c_64x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_16x64_avx2 FPFX(idct_c_16x64_avx2) void idct_c_16x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst); +#endif // scan the cg coefficient #define coeff_scan_4x4_xy_sse128 FPFX(coeff_scan_4x4_xy_sse128) @@ -326,111 +338,153 @@ void coeff_scan_4x4_yx_sse128(coeff_t *dst, const coeff_t *src, int i_src_shift) void coeff_scan4_xy_sse128(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4); #define coeff_scan4_yx_sse128 FPFX(coeff_scan4_yx_sse128) void coeff_scan4_yx_sse128(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4); +#if defined(__AVX2__) #define coeff_scan4_xy_avx FPFX(coeff_scan4_xy_avx) void coeff_scan4_xy_avx(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4); #define coeff_scan4_yx_avx FPFX(coeff_scan4_yx_avx) void coeff_scan4_yx_avx(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4); - +#endif #define abs_coeff_sse128 FPFX(abs_coeff_sse128) void abs_coeff_sse128(coeff_t *dst, const coeff_t *src, const int i_coef); #define add_sign_sse128 FPFX(add_sign_sse128) int add_sign_sse128(coeff_t *dst, 
const coeff_t *abs_val, const int i_coef); -#define quant_c_avx2 FPFX(quant_c_avx2) -int quant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); -#define dequant_c_avx2 FPFX(dequant_c_avx2) -void dequant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift); #define quant_c_sse128 FPFX(quant_c_avx2) int quant_c_sse128(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); #define dequant_c_sse128 FPFX(dequant_c_sse128) void dequant_c_sse128(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); +#if defined(__AVX2__) +#define quant_c_avx2 FPFX(quant_c_avx2) +int quant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); +#define dequant_c_avx2 FPFX(dequant_c_avx2) +void dequant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift); #define abs_coeff_avx2 FPFX(abs_coeff_avx2) void abs_coeff_avx2(coeff_t *dst, const coeff_t *src, const int i_coef); #define add_sign_avx2 FPFX(add_sign_avx2) int add_sign_avx2(coeff_t *dst, const coeff_t *abs_val, const int i_coef); - -#define SAO_on_block_sse128 FPFX(SAO_on_block_sse128) -void SAO_on_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, - int i_src, int i_block_w, int i_block_h, - int *lcu_avail, SAOBlkParam *sao_param); #define SAO_on_block_sse256 FPFX(SAO_on_block_sse256) void SAO_on_block_sse256(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,int i_block_w, int i_block_h, int *lcu_avail, SAOBlkParam *sao_param); +#endif + +#if !HIGH_BIT_DEPTH +#define SAO_on_block_sse128 FPFX(SAO_on_block_sse128) +void SAO_on_block_sse128(xavs2_t *h, pel_t *p_dst, int i_dst, pel_t *p_src, + int i_src, int i_block_w, int i_block_h, + int *lcu_avail, SAOBlkParam *sao_param); +#else +#define SAO_on_block_bo_sse128 FPFX(SAO_on_block_bo_sse128) +void SAO_on_block_bo_sse128(xavs2_t *h, pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, 
const SAOBlkParam* saoBlkParam); +#define SAO_on_block_eo_0_sse128 FPFX(SAO_on_block_eo_0_sse128) +void SAO_on_block_eo_0_sse128(xavs2_t *h, pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset); +#define SAO_on_block_eo_45_sse128 FPFX(SAO_on_block_eo_45_sse128) +void SAO_on_block_eo_45_sse128(xavs2_t *h, pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset); +#define SAO_on_block_eo_90_sse128 FPFX(SAO_on_block_eo_90_sse128) +void SAO_on_block_eo_90_sse128(xavs2_t *h, pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset); +#define SAO_on_block_eo_135_sse128 FPFX(SAO_on_block_eo_135_sse128) +void SAO_on_block_eo_135_sse128(xavs2_t *h, pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset); +#endif #define alf_flt_one_block_sse128 FPFX(alf_flt_one_block_sse128) -void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, +#if !HIGH_BIT_DEPTH +void alf_flt_one_block_sse128(xavs2_t *h, pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail); +#else +void alf_flt_one_block_sse128(xavs2_t *h, pel_t* p_dst, const pel_t* p_src, int stride, + int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, + int* alf_coeff, int b_top_avail, int b_down_avail); +#endif +#if defined(__AVX2__) +#define alf_filter_block_avx2_10bit FPFX(alf_filter_block_avx2_10bit) +void alf_filter_block_avx2(xavs2_t *h, pel_t* p_dst, const pel_t* p_src, int stride, + int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, + int* alf_coeff, int b_top_avail, int b_down_avail); +#endif +#if !HIGH_BIT_DEPTH #define intra_pred_dc_sse128 FPFX(intra_pred_dc_sse128) -void intra_pred_dc_sse128 
(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_dc_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_plane_sse128 FPFX(intra_pred_plane_sse128) -void intra_pred_plane_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_plane_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_bilinear_sse128 FPFX(intra_pred_bilinear_sse128) -void intra_pred_bilinear_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_bilinear_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_hor_sse128 FPFX(intra_pred_hor_sse128) -void intra_pred_hor_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_hor_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ver_sse128 FPFX(intra_pred_ver_sse128) -void intra_pred_ver_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ver_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +#else +#define intra_pred_dc_sse128 FPFX(intra_pred_dc_sse128) +void intra_pred_dc_sse128 (xavs2_t *h, pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight); +#define intra_pred_plane_sse128 FPFX(intra_pred_plane_sse128) +void intra_pred_plane_sse128 (xavs2_t *h, pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight); +#define intra_pred_bilinear_sse128 FPFX(intra_pred_bilinear_sse128) +void intra_pred_bilinear_sse128 (xavs2_t *h, pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight); +#define intra_pred_hor_sse128 FPFX(intra_pred_hor_sse128) +void intra_pred_hor_sse128 (xavs2_t *h, pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight); +#define 
intra_pred_ver_sse128 FPFX(intra_pred_ver_sse128) +void intra_pred_ver_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +#endif #define intra_pred_ang_x_3_sse128 FPFX(intra_pred_ang_x_3_sse128) -void intra_pred_ang_x_3_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_x_3_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_4_sse128 FPFX(intra_pred_ang_x_4_sse128) -void intra_pred_ang_x_4_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_x_4_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_5_sse128 FPFX(intra_pred_ang_x_5_sse128) -void intra_pred_ang_x_5_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_x_5_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_6_sse128 FPFX(intra_pred_ang_x_6_sse128) -void intra_pred_ang_x_6_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_x_6_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_7_sse128 FPFX(intra_pred_ang_x_7_sse128) -void intra_pred_ang_x_7_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_x_7_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_8_sse128 FPFX(intra_pred_ang_x_8_sse128) -void intra_pred_ang_x_8_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_x_8_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_9_sse128 FPFX(intra_pred_ang_x_9_sse128) -void intra_pred_ang_x_9_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int 
bsx, int bsy); +void intra_pred_ang_x_9_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_10_sse128 FPFX(intra_pred_ang_x_10_sse128) -void intra_pred_ang_x_10_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_x_10_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_11_sse128 FPFX(intra_pred_ang_x_11_sse128) -void intra_pred_ang_x_11_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_x_11_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_25_sse128 FPFX(intra_pred_ang_y_25_sse128) -void intra_pred_ang_y_25_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_y_25_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_26_sse128 FPFX(intra_pred_ang_y_26_sse128) -void intra_pred_ang_y_26_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_y_26_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_28_sse128 FPFX(intra_pred_ang_y_28_sse128) -void intra_pred_ang_y_28_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_y_28_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_30_sse128 FPFX(intra_pred_ang_y_30_sse128) -void intra_pred_ang_y_30_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_y_30_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_31_sse128 FPFX(intra_pred_ang_y_31_sse128) -void intra_pred_ang_y_31_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int 
bsy); +void intra_pred_ang_y_31_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_32_sse128 FPFX(intra_pred_ang_y_32_sse128) -void intra_pred_ang_y_32_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_y_32_sse128 (xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_13_sse128 FPFX(intra_pred_ang_xy_13_sse128) -void intra_pred_ang_xy_13_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_xy_13_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_14_sse128 FPFX(intra_pred_ang_xy_14_sse128) -void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_xy_14_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_16_sse128 FPFX(intra_pred_ang_xy_16_sse128) -void intra_pred_ang_xy_16_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_xy_16_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_18_sse128 FPFX(intra_pred_ang_xy_18_sse128) -void intra_pred_ang_xy_18_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_xy_18_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_20_sse128 FPFX(intra_pred_ang_xy_20_sse128) -void intra_pred_ang_xy_20_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_xy_20_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_22_sse128 FPFX(intra_pred_ang_xy_22_sse128) -void intra_pred_ang_xy_22_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int 
bsy); +void intra_pred_ang_xy_22_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_23_sse128 FPFX(intra_pred_ang_xy_23_sse128) -void intra_pred_ang_xy_23_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); +void intra_pred_ang_xy_23_sse128(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define fill_edge_samples_0_sse128 FPFX(fill_edge_samples_0_sse128) -void fill_edge_samples_0_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); +void fill_edge_samples_0_sse128 (xavs2_t *h, const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); #define fill_edge_samples_x_sse128 FPFX(fill_edge_samples_x_sse128) -void fill_edge_samples_x_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); +void fill_edge_samples_x_sse128 (xavs2_t *h, const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); #define fill_edge_samples_y_sse128 FPFX(fill_edge_samples_y_sse128) -void fill_edge_samples_y_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); +void fill_edge_samples_y_sse128 (xavs2_t *h, const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); #define fill_edge_samples_xy_sse128 FPFX(fill_edge_samples_xy_sse128) -void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); +void fill_edge_samples_xy_sse128(xavs2_t *h, const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); //intra prediction avx functions +#if defined(__AVX2__) #define intra_pred_ver_avx FPFX(intra_pred_ver_avx) void intra_pred_ver_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_hor_avx 
FPFX(intra_pred_hor_avx) @@ -487,7 +541,7 @@ void intra_pred_ang_y_30_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, in void intra_pred_ang_y_31_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_32_avx FPFX(intra_pred_ang_y_32_avx) void intra_pred_ang_y_32_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); - +#endif #define mad_16x16_sse128 FPFX(mad_16x16_sse128) diff --git a/source/common/vec/intrinsic_alf.c b/source/common/vec/intrinsic_alf.c index 00bc61c..88a4100 100644 --- a/source/common/vec/intrinsic_alf.c +++ b/source/common/vec/intrinsic_alf.c @@ -44,7 +44,9 @@ #include -void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, +#if !HIGH_BIT_DEPTH +void alf_flt_one_block_sse128(xavs2_t *h, + pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail) { @@ -58,7 +60,7 @@ void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, __m128i mSwitch1, mSwitch2, mSwitch3, mSwitch4, mSwitch5; __m128i mAddOffset; __m128i mZero = _mm_set1_epi16(0); - __m128i mMax = _mm_set1_epi16((short)(max_pel_value)); + __m128i mMax = _mm_set1_epi16((short)((1 << h->param->input_sample_bit_depth) - 1)); __m128i mask; int i, j; @@ -113,15 +115,15 @@ void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, T01 = _mm_loadu_si128((__m128i*)&p_src5[j]); E00 = _mm_unpacklo_epi8(T00, T01); E01 = _mm_unpackhi_epi8(T00, T01); - S00 = _mm_maddubs_epi16(E00, C0);//前8个像素所有C0*P0的结果 - S01 = _mm_maddubs_epi16(E01, C0);//后8个像素所有C0*P0的结果 + S00 = _mm_maddubs_epi16(E00, C0);//鍓8涓儚绱犳墍鏈塁0*P0鐨勭粨鏋 + S01 = _mm_maddubs_epi16(E01, C0);//鍚8涓儚绱犳墍鏈塁0*P0鐨勭粨鏋 T10 = _mm_loadu_si128((__m128i*)&p_src4[j]); T11 = _mm_loadu_si128((__m128i*)&p_src3[j]); E10 = _mm_unpacklo_epi8(T10, T11); E11 = _mm_unpackhi_epi8(T10, T11); - S10 = _mm_maddubs_epi16(E10, C1);//前8个像素所有C1*P1的结果 - 
S11 = _mm_maddubs_epi16(E11, C1);//后8个像素所有C1*P1的结果 + S10 = _mm_maddubs_epi16(E10, C1);//鍓8涓儚绱犳墍鏈塁1*P1鐨勭粨鏋 + S11 = _mm_maddubs_epi16(E11, C1);//鍚8涓儚绱犳墍鏈塁1*P1鐨勭粨鏋 T20 = _mm_loadu_si128((__m128i*)&p_src2[j - 1]); T21 = _mm_loadu_si128((__m128i*)&p_src1[j + 1]); @@ -161,26 +163,26 @@ void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, S8 = _mm_maddubs_epi16(T4, C33); S50 = _mm_hadds_epi16(S5, S6); S51 = _mm_hadds_epi16(S7, S8); - S5 = _mm_hadds_epi16(S50, S51);//前8个 + S5 = _mm_hadds_epi16(S50, S51);//鍓8涓 S4 = _mm_maddubs_epi16(T5, C33); S6 = _mm_maddubs_epi16(T6, C33); S7 = _mm_maddubs_epi16(T7, C33); S8 = _mm_maddubs_epi16(T8, C33); S60 = _mm_hadds_epi16(S4, S6); S61 = _mm_hadds_epi16(S7, S8); - S6 = _mm_hadds_epi16(S60, S61);//后8个 + S6 = _mm_hadds_epi16(S60, S61);//鍚8涓 S0 = _mm_adds_epi16(S00, S10); S1 = _mm_adds_epi16(S30, S20); S2 = _mm_adds_epi16(S40, S5); S3 = _mm_adds_epi16(S1, S0); - SS1 = _mm_adds_epi16(S2, S3);//前8个 + SS1 = _mm_adds_epi16(S2, S3);//鍓8涓 S0 = _mm_adds_epi16(S01, S11); S1 = _mm_adds_epi16(S31, S21); S2 = _mm_adds_epi16(S41, S6); S3 = _mm_adds_epi16(S1, S0); - SS2 = _mm_adds_epi16(S2, S3);//后8个 + SS2 = _mm_adds_epi16(S2, S3);//鍚8涓 SS1 = _mm_adds_epi16(SS1, mAddOffset); @@ -206,5 +208,309 @@ void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, p_dst += i_dst; } } +#else +/***************************************************************************** +* Copyright (C) 2016 uavs2dec project, +* National Engineering Laboratory for Video Technology(Shenzhen), +* Digital Media R&D Center at Peking University Shenzhen Graduate School, China +* Project Leader: Ronggang Wang +* +* Main Authors: Zhenyu Wang , Kui Fan +* Shenghao Zhang <1219759986@qq.com>拢卢 Bingjie Han, Kaili Yao, Hongbin Cao, Yueming Wang, +* Jing Su, Jiaying Yan, Junru Li +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free 
Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at rgwang@pkusz.edu.cn. +*****************************************************************************/ + +void alf_flt_one_block_sse128(xavs2_t *h, + pel_t* p_dst, const pel_t* p_src, int stride, + int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, + int* alf_coeff, int b_top_avail, int b_down_avail) +{ + const pel_t* p_src1, * p_src2, * p_src3, * p_src4, * p_src5, * p_src6; + + __m128i T00, T01, T10, T11, T20, T21, T30, T31, T40, T41; + __m128i E00, E01, E10, E11, E20, E21, E30, E31, E40, E41; + __m128i C0, C1, C2, C3, C4, C5, C6, C7, C8; + __m128i S00, S01, S10, S11, S20, S21, S30, S31, S40, S41, S50, S51, S60, S61, SS1, SS2, S, S70, S71, S80, S81; + __m128i mAddOffset; + __m128i mask; + __m128i zero = _mm_setzero_si128(); + int max_pixel = (1 << h->param->input_sample_bit_depth) - 1; + __m128i max_val = _mm_set1_epi16(max_pixel); + + int i, j; + int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y; + int endPos = b_down_avail ? 
(lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); + int lcu_pix_xEnd = lcu_pix_x + lcu_width; + + p_src += (startPos * stride); + p_dst += (startPos * stride); + + C0 = _mm_set1_epi16((pel_t)alf_coeff[0]); + C1 = _mm_set1_epi16((pel_t)alf_coeff[1]); + C2 = _mm_set1_epi16((pel_t)alf_coeff[2]); + C3 = _mm_set1_epi16((pel_t)alf_coeff[3]); + C4 = _mm_set1_epi16((pel_t)alf_coeff[4]); + C5 = _mm_set1_epi16((pel_t)alf_coeff[5]); + C6 = _mm_set1_epi16((pel_t)alf_coeff[6]); + C7 = _mm_set1_epi16((pel_t)alf_coeff[7]); + C8 = _mm_set1_epi16((pel_t)alf_coeff[8]); + + mAddOffset = _mm_set1_epi32(32); + + if (lcu_width & 7) { + int lcu_pix_xEnd8 = lcu_pix_xEnd - (lcu_width & 0x07); + mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(lcu_width & 7) - 1])); + for (i = startPos; i < endPos; i++) { + int yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 1); + int yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 1); + p_src1 = p_src + (yBottom - i) * stride; + p_src2 = p_src + (yUp - i) * stride; + + yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 2); + yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 2); + p_src3 = p_src + (yBottom - i) * stride; + p_src4 = p_src + (yUp - i) * stride; + + yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 3); + yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 3); + p_src5 = p_src + (yBottom - i) * stride; + p_src6 = p_src + (yUp - i) * stride; + + for (j = lcu_pix_x; j < lcu_pix_xEnd; j += 8) { + T00 = _mm_loadu_si128((__m128i*) & p_src6[j]); + T01 = _mm_loadu_si128((__m128i*) & p_src5[j]); + E00 = _mm_unpacklo_epi16(T00, T01); + E01 = _mm_unpackhi_epi16(T00, T01); + S00 = _mm_madd_epi16(E00, C0); + S01 = _mm_madd_epi16(E01, C0); + + T10 = _mm_loadu_si128((__m128i*) & p_src4[j]); + T11 = _mm_loadu_si128((__m128i*) & p_src3[j]); + E10 = _mm_unpacklo_epi16(T10, T11); + E11 = _mm_unpackhi_epi16(T10, T11); + S10 = _mm_madd_epi16(E10, C1); + S11 = _mm_madd_epi16(E11, C1); + + T20 = _mm_loadu_si128((__m128i*) & p_src2[j - 1]); + T21 = _mm_loadu_si128((__m128i*) & 
p_src1[j + 1]); + E20 = _mm_unpacklo_epi16(T20, T21); + E21 = _mm_unpackhi_epi16(T20, T21); + S20 = _mm_madd_epi16(E20, C2); + S21 = _mm_madd_epi16(E21, C2); + + T30 = _mm_loadu_si128((__m128i*) & p_src2[j]); + T31 = _mm_loadu_si128((__m128i*) & p_src1[j]); + E30 = _mm_unpacklo_epi16(T30, T31); + E31 = _mm_unpackhi_epi16(T30, T31); + S30 = _mm_madd_epi16(E30, C3); + S31 = _mm_madd_epi16(E31, C3); + + T40 = _mm_loadu_si128((__m128i*) & p_src2[j + 1]); + T41 = _mm_loadu_si128((__m128i*) & p_src1[j - 1]); + E40 = _mm_unpacklo_epi16(T40, T41); + E41 = _mm_unpackhi_epi16(T40, T41); + S40 = _mm_madd_epi16(E40, C4); + S41 = _mm_madd_epi16(E41, C4); + + T40 = _mm_loadu_si128((__m128i*) & p_src[j - 3]); + T41 = _mm_loadu_si128((__m128i*) & p_src[j + 3]); + E40 = _mm_unpacklo_epi16(T40, T41); + E41 = _mm_unpackhi_epi16(T40, T41); + S50 = _mm_madd_epi16(E40, C5); + S51 = _mm_madd_epi16(E41, C5); + + T40 = _mm_loadu_si128((__m128i*) & p_src[j - 2]); + T41 = _mm_loadu_si128((__m128i*) & p_src[j + 2]); + E40 = _mm_unpacklo_epi16(T40, T41); + E41 = _mm_unpackhi_epi16(T40, T41); + S60 = _mm_madd_epi16(E40, C6); + S61 = _mm_madd_epi16(E41, C6); + + T40 = _mm_loadu_si128((__m128i*) & p_src[j - 1]); + T41 = _mm_loadu_si128((__m128i*) & p_src[j + 1]); + E40 = _mm_unpacklo_epi16(T40, T41); + E41 = _mm_unpackhi_epi16(T40, T41); + S70 = _mm_madd_epi16(E40, C7); + S71 = _mm_madd_epi16(E41, C7); + + T40 = _mm_loadu_si128((__m128i*) & p_src[j]); + E40 = _mm_unpacklo_epi16(T40, zero); + E41 = _mm_unpackhi_epi16(T40, zero); + S80 = _mm_madd_epi16(E40, C8); + S81 = _mm_madd_epi16(E41, C8); + + SS1 = _mm_add_epi32(S00, S10); + SS1 = _mm_add_epi32(SS1, S20); + SS1 = _mm_add_epi32(SS1, S30); + SS1 = _mm_add_epi32(SS1, S40); + SS1 = _mm_add_epi32(SS1, S50); + SS1 = _mm_add_epi32(SS1, S60); + SS1 = _mm_add_epi32(SS1, S70); + SS1 = _mm_add_epi32(SS1, S80); + + SS2 = _mm_add_epi32(S01, S11); + SS2 = _mm_add_epi32(SS2, S21); + SS2 = _mm_add_epi32(SS2, S31); + SS2 = _mm_add_epi32(SS2, S41); + SS2 = 
_mm_add_epi32(SS2, S51); + SS2 = _mm_add_epi32(SS2, S61); + SS2 = _mm_add_epi32(SS2, S71); + SS2 = _mm_add_epi32(SS2, S81); + + SS1 = _mm_add_epi32(SS1, mAddOffset); + SS1 = _mm_srai_epi32(SS1, 6); + + SS2 = _mm_add_epi32(SS2, mAddOffset); + SS2 = _mm_srai_epi32(SS2, 6); + + S = _mm_packus_epi32(SS1, SS2); + S = _mm_min_epu16(S, max_val); + if (j != lcu_pix_xEnd8) { + _mm_storeu_si128((__m128i*)(p_dst + j), S); + } + else { + _mm_maskmoveu_si128(S, mask, (char*)(p_dst + j)); + break; + } + } + + p_src += stride; + p_dst += stride; + } + } + else { + for (i = startPos; i < endPos; i++) { + int yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 1); + int yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 1); + p_src1 = p_src + (yBottom - i) * stride; + p_src2 = p_src + (yUp - i) * stride; + + yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 2); + yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 2); + p_src3 = p_src + (yBottom - i) * stride; + p_src4 = p_src + (yUp - i) * stride; + + yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 3); + yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 3); + p_src5 = p_src + (yBottom - i) * stride; + p_src6 = p_src + (yUp - i) * stride; + + for (j = lcu_pix_x; j < lcu_pix_xEnd; j += 8) { + T00 = _mm_loadu_si128((__m128i*) & p_src6[j]); + T01 = _mm_loadu_si128((__m128i*) & p_src5[j]); + E00 = _mm_unpacklo_epi16(T00, T01); + E01 = _mm_unpackhi_epi16(T00, T01); + S00 = _mm_madd_epi16(E00, C0); + S01 = _mm_madd_epi16(E01, C0); + + T10 = _mm_loadu_si128((__m128i*) & p_src4[j]); + T11 = _mm_loadu_si128((__m128i*) & p_src3[j]); + E10 = _mm_unpacklo_epi16(T10, T11); + E11 = _mm_unpackhi_epi16(T10, T11); + S10 = _mm_madd_epi16(E10, C1); + S11 = _mm_madd_epi16(E11, C1); + + T20 = _mm_loadu_si128((__m128i*) & p_src2[j - 1]); + T21 = _mm_loadu_si128((__m128i*) & p_src1[j + 1]); + E20 = _mm_unpacklo_epi16(T20, T21); + E21 = _mm_unpackhi_epi16(T20, T21); + S20 = _mm_madd_epi16(E20, C2); + S21 = _mm_madd_epi16(E21, C2); + + T30 = _mm_loadu_si128((__m128i*) & p_src2[j]); 
+ T31 = _mm_loadu_si128((__m128i*) & p_src1[j]); + E30 = _mm_unpacklo_epi16(T30, T31); + E31 = _mm_unpackhi_epi16(T30, T31); + S30 = _mm_madd_epi16(E30, C3); + S31 = _mm_madd_epi16(E31, C3); + + T40 = _mm_loadu_si128((__m128i*) & p_src2[j + 1]); + T41 = _mm_loadu_si128((__m128i*) & p_src1[j - 1]); + E40 = _mm_unpacklo_epi16(T40, T41); + E41 = _mm_unpackhi_epi16(T40, T41); + S40 = _mm_madd_epi16(E40, C4); + S41 = _mm_madd_epi16(E41, C4); + + T40 = _mm_loadu_si128((__m128i*) & p_src[j - 3]); + T41 = _mm_loadu_si128((__m128i*) & p_src[j + 3]); + E40 = _mm_unpacklo_epi16(T40, T41); + E41 = _mm_unpackhi_epi16(T40, T41); + S50 = _mm_madd_epi16(E40, C5); + S51 = _mm_madd_epi16(E41, C5); + + T40 = _mm_loadu_si128((__m128i*) & p_src[j - 2]); + T41 = _mm_loadu_si128((__m128i*) & p_src[j + 2]); + E40 = _mm_unpacklo_epi16(T40, T41); + E41 = _mm_unpackhi_epi16(T40, T41); + S60 = _mm_madd_epi16(E40, C6); + S61 = _mm_madd_epi16(E41, C6); + + T40 = _mm_loadu_si128((__m128i*) & p_src[j - 1]); + T41 = _mm_loadu_si128((__m128i*) & p_src[j + 1]); + E40 = _mm_unpacklo_epi16(T40, T41); + E41 = _mm_unpackhi_epi16(T40, T41); + S70 = _mm_madd_epi16(E40, C7); + S71 = _mm_madd_epi16(E41, C7); + + T40 = _mm_loadu_si128((__m128i*) & p_src[j]); + E40 = _mm_unpacklo_epi16(T40, zero); + E41 = _mm_unpackhi_epi16(T40, zero); + S80 = _mm_madd_epi16(E40, C8); + S81 = _mm_madd_epi16(E41, C8); + + SS1 = _mm_add_epi32(S00, S10); + SS1 = _mm_add_epi32(SS1, S20); + SS1 = _mm_add_epi32(SS1, S30); + SS1 = _mm_add_epi32(SS1, S40); + SS1 = _mm_add_epi32(SS1, S50); + SS1 = _mm_add_epi32(SS1, S60); + SS1 = _mm_add_epi32(SS1, S70); + SS1 = _mm_add_epi32(SS1, S80); + + SS2 = _mm_add_epi32(S01, S11); + SS2 = _mm_add_epi32(SS2, S21); + SS2 = _mm_add_epi32(SS2, S31); + SS2 = _mm_add_epi32(SS2, S41); + SS2 = _mm_add_epi32(SS2, S51); + SS2 = _mm_add_epi32(SS2, S61); + SS2 = _mm_add_epi32(SS2, S71); + SS2 = _mm_add_epi32(SS2, S81); + + SS1 = _mm_add_epi32(SS1, mAddOffset); + SS1 = _mm_srai_epi32(SS1, 6); + + SS2 = 
_mm_add_epi32(SS2, mAddOffset); + SS2 = _mm_srai_epi32(SS2, 6); + + S = _mm_packus_epi32(SS1, SS2); + S = _mm_min_epu16(S, max_val); + + _mm_storeu_si128((__m128i*)(p_dst + j), S); + + } + + p_src += stride; + p_dst += stride; + } + } +} +#endif diff --git a/source/common/vec/intrinsic_dct.c b/source/common/vec/intrinsic_dct.c index f5126a3..dfd3ff1 100644 --- a/source/common/vec/intrinsic_dct.c +++ b/source/common/vec/intrinsic_dct.c @@ -42,6 +42,7 @@ #include "../basic_types.h" #include "../avs2_defs.h" +#include "../common.h" #include "intrinsic.h" void *xavs2_fast_memzero_mmx(void *dst, size_t n); @@ -396,9 +397,10 @@ ALIGN16(static const int16_t g_2TC_V[8 * (2 * SEC_TR_SIZE)]) = { /* --------------------------------------------------------------------------- futl change 2016.12.19*/ -void dct_c_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_4x4_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { - const int SHIFT1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; + const int SHIFT1 = B4X4_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT; const int SHIFT2 = B4X4_IN_BIT + FACTO_BIT; const int ADD1 = (1 << SHIFT1) >> 1; const int ADD2 = (1 << SHIFT2) >> 1; @@ -484,9 +486,10 @@ void dct_c_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- futl change 2016.12.19*/ -void dct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_8x8_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { - const int SHIFT1 = B8X8_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; + const int SHIFT1 = B8X8_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT; const int SHIFT2 = B8X8_IN_BIT + FACTO_BIT; const int ADD1 = (1 << SHIFT1) >> 1; const int ADD2 = (1 << SHIFT2) >> 1; @@ -685,9 +688,10 @@ void dct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_src) /* 
--------------------------------------------------------------------------- */ -void dct_c_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_16x4_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { - const int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; + const int shift1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT; const int shift2 = B16X16_IN_BIT + FACTO_BIT - 2; const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << shift2) >> 1; @@ -973,9 +977,10 @@ void dct_c_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- */ -void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_4x16_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { - const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2; + const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT - 2; const int ADD1 = (1 << SHIFT1) >> 1; const int SHIFT2 = B16X16_IN_BIT + FACTO_BIT; const int ADD2 = (1 << SHIFT2) >> 1; @@ -1037,7 +1042,7 @@ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) u11 = _mm_madd_epi16(t2, k_p17_p42); u13 = _mm_madd_epi16(t2, k_m42_p17); - //移位补偿 + //脪脝脦禄虏鹿鲁楼 u10 = _mm_srai_epi32(_mm_add_epi32(u10, c_add1), SHIFT1); u11 = _mm_srai_epi32(_mm_add_epi32(u11, c_add1), SHIFT1); u12 = _mm_srai_epi32(_mm_add_epi32(u12, c_add1), SHIFT1); @@ -1061,7 +1066,7 @@ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) u21 = _mm_madd_epi16(t2, k_p17_p42); u23 = _mm_madd_epi16(t2, k_m42_p17); - //移位补偿 + //脪脝脦禄虏鹿鲁楼 u20 = _mm_srai_epi32(_mm_add_epi32(u20, c_add1), SHIFT1); u21 = _mm_srai_epi32(_mm_add_epi32(u21, c_add1), SHIFT1); u22 = _mm_srai_epi32(_mm_add_epi32(u22, c_add1), SHIFT1); @@ -1089,7 +1094,7 @@ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) u11 = _mm_madd_epi16(t2, 
k_p17_p42); u13 = _mm_madd_epi16(t2, k_m42_p17); - //移位补偿 + //脪脝脦禄虏鹿鲁楼 u10 = _mm_srai_epi32(_mm_add_epi32(u10, c_add1), SHIFT1); u11 = _mm_srai_epi32(_mm_add_epi32(u11, c_add1), SHIFT1); u12 = _mm_srai_epi32(_mm_add_epi32(u12, c_add1), SHIFT1); @@ -1111,7 +1116,7 @@ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) u22 = _mm_madd_epi16(t0, k_p32_m32); u21 = _mm_madd_epi16(t2, k_p17_p42); u23 = _mm_madd_epi16(t2, k_m42_p17); - //移位补偿 + //脪脝脦禄虏鹿鲁楼 u20 = _mm_srai_epi32(_mm_add_epi32(u20, c_add1), SHIFT1); u21 = _mm_srai_epi32(_mm_add_epi32(u21, c_add1), SHIFT1); u22 = _mm_srai_epi32(_mm_add_epi32(u22, c_add1), SHIFT1); @@ -1343,9 +1348,10 @@ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- */ -void dct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_16x16_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { - const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; + const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT; const int SHIFT2 = B16X16_IN_BIT + FACTO_BIT; const int ADD1 = (1 << SHIFT1) >> 1; const int ADD2 = (1 << SHIFT2) >> 1; @@ -1765,10 +1771,11 @@ void dct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- */ -void dct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_8x32_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { int i; - int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01); + int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01); int shift2 = B32X32_IN_BIT + FACTO_BIT; const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << shift2) >> 1; @@ -1853,7 +1860,7 @@ void dct_c_8x32_sse128(const coeff_t *src, coeff_t 
*dst, int i_src) I5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ I6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ I7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - + TRANSPOSE_8x8(in0, in1, in2, in3, in4, in5, in6, in7) #undef TRANSPOSE_8x8 @@ -2131,7 +2138,7 @@ void dct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_src) \ tResult = _mm_packs_epi32(T60, TT60); \ _mm_storeu_si128((__m128i*)&dst[(dstPos)* 8], tResult); \ - + MAKE_ODD(44, 44, 44, 44, 0); MAKE_ODD(45, 45, 45, 45, 16); MAKE_ODD(46, 47, 46, 47, 8); @@ -2173,10 +2180,11 @@ void dct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- */ -void dct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_32x8_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { int i; - int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; + int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT; int shift2 = B32X32_IN_BIT + FACTO_BIT - 2 - (i_src & 0x01); const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << shift2) >> 1; @@ -2535,9 +2543,10 @@ void dct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_src) /* --------------------------------------------------------------------------- */ -void dct_c_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_32x32_half_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { - const int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); + const int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); const int SHIFT2 = B32X32_IN_BIT + FACTO_BIT; const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << SHIFT2) >> 1; @@ -2809,7 +2818,7 @@ void dct_c_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_src) /* clear result buffer */ xavs2_fast_memzero_mmx(dst, 32 * 32 * sizeof(coeff_t)); - // DCT2, 
只保留前16行和前16列 + // DCT2, 脰禄卤拢脕么脟掳16脨脨潞脥脟掳16脕脨 for (i = 0; i < 16 / 4; i++) { // OPT_ME: to avoid register spill, I use matrix multiply, have other way? T00A = im[i * 4 + 0][0]; // [07 06 05 04 03 02 01 00] @@ -2920,9 +2929,10 @@ void dct_c_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_src) } //optimize 32x32 size transform -void dct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_32x32_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { - const int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); + const int shift1 = B32X32_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); const int SHIFT2 = B32X32_IN_BIT + FACTO_BIT; const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << SHIFT2) >> 1; @@ -2990,7 +3000,7 @@ void dct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_src) T07D = _mm_load_si128((__m128i*)(src + 24)); src += i_src; - //_mm_load_si128((__m128i)tab_dct_16_0[1]) 换成 *((__m128i*)tab_dct_16_0[1]) + //_mm_load_si128((__m128i)tab_dct_16_0[1]) 禄禄鲁脡 *((__m128i*)tab_dct_16_0[1]) T00A = _mm_shuffle_epi8(T00A, *((__m128i*)tab_dct_16_0[1])); // [05 02 06 01 04 03 07 00] T00B = _mm_shuffle_epi8(T00B, *((__m128i*)tab_dct_32_0[0])); // [10 13 09 14 11 12 08 15] T00C = _mm_shuffle_epi8(T00C, *((__m128i*)tab_dct_16_0[1])); // [21 18 22 17 20 19 23 16] @@ -3087,7 +3097,7 @@ void dct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_src) T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1); T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1); T60 = _mm_packs_epi32(T50, T51); - im[0][i] = T60;//16个0到8行计算出来的变换系数(16 bit per bit width) + im[0][i] = T60;//16赂枚0碌陆8脨脨录脝脣茫鲁枚脌麓碌脛卤盲禄禄脧碌脢媒(16 bit per bit width) T50 = _mm_hsub_epi32(T40, T41); T51 = _mm_hsub_epi32(T42, T43); @@ -3890,9 +3900,10 @@ void transform_2nd_sse128(coeff_t *coeff, int i_coeff, int i_mode, int b_top, in /* 
--------------------------------------------------------------------------- */ -void transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff) +void transform_4x4_2nd_sse128(xavs2_t *h, + coeff_t *coeff, int i_coeff) { - const int SHIFT1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + 1; + const int SHIFT1 = B4X4_IN_BIT + FACTO_BIT + h->param->input_sample_bit_depth + 1 - LIMIT_BIT + 1; const int SHIFT2 = B4X4_IN_BIT + FACTO_BIT + 1; const int ADD1 = 1 << (SHIFT1 - 1); const int ADD2 = 1 << (SHIFT2 - 1); @@ -3998,7 +4009,7 @@ void transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff) -// transpose 8x8 & transpose 16x16(矩阵转置) +// transpose 8x8 & transpose 16x16(戮脴脮贸脳陋脰脙) #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ @@ -4024,22 +4035,22 @@ void transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff) O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - + #define TRANSPOSE_16x16_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1) \ TRANSPOSE_8x8_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0); \ TRANSPOSE_8x8_16BIT(A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1); \ TRANSPOSE_8x8_16BIT(A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0); \ TRANSPOSE_8x8_16BIT(A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, 
B15_1); \ - + void wavelet_16x64_sse128(coeff_t *coeff) { - //锟斤拷锟斤拷 16*64 + //茂驴陆茂驴陆茂驴陆茂驴陆 16*64 __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2]; - //锟斤拷锟斤拷 64*16 + //茂驴陆茂驴陆茂驴陆茂驴陆 64*16 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8]; - //锟斤拷时 + //茂驴陆茂驴陆脢卤 __m128i B00, B01, B02, B03, B04, B05, B06, B07, B08, B09, B10, B11, B12, B13, B14, B15, B16, B17, B18, B19, B20, B21, B22, B23, B24, B25, B26, B27, B28, B29, B30, B31; __m128i B32, B33, B34, B35, B36, B37, B38, B39, B40, B41, B42, B43, B44, B45, B46, B47, B48, B49, B50, B51, B52, B53, B54, B55, B56, B57, B58, B59, B60, B61, B62, B63; @@ -4166,7 +4177,7 @@ void wavelet_16x64_sse128(coeff_t *coeff) } /* step 2: vertical transform */ - /* copy 转锟斤拷*/ + /* copy 脳陋茂驴陆茂驴陆*/ TRANSPOSE_8x8_16BIT(T00[0], T02[0], T04[0], T06[0], T08[0], T10[0], T12[0], T14[0], B00, B01, B02, B03, B04, B05, B06, B07); TRANSPOSE_8x8_16BIT(T00[1], T02[1], T04[1], T06[1], T08[1], T10[1], T12[1], T14[1], B08, B09, B10, B11, B12, B13, B14, B15); TRANSPOSE_8x8_16BIT(T00[2], T02[2], T04[2], T06[2], T08[2], T10[2], T12[2], T14[2], B16, B17, B18, B19, B20, B21, B22, B23); @@ -4290,16 +4301,16 @@ void wavelet_16x64_sse128(coeff_t *coeff) void wavelet_64x16_sse128(coeff_t *coeff) { - //锟斤拷锟斤拷 16*64 + //茂驴陆茂驴陆茂驴陆茂驴陆 16*64 __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], 
V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2]; - //锟斤拷锟斤拷 64*16 + //茂驴陆茂驴陆茂驴陆茂驴陆 64*16 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8]; - //锟斤拷时 64*16 + //茂驴陆茂驴陆脢卤 64*16 __m128i A00[4], A01[4], A02[4], A03[4], A04[4], A05[4], A06[4], A07[4], A08[4], A09[4], A10[4], A11[4], A12[4], A13[4], A14[4], A15[4]; - //临时 + //脕脵脢卤 __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; int i; @@ -4497,7 +4508,7 @@ void wavelet_64x16_sse128(coeff_t *coeff) V62[1] = _mm_add_epi16(V62[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V61[1], V63[1]), mAddOffset2), 2)); /* step 2: vertical transform */ - //转锟斤拷 + //脳陋茂驴陆茂驴陆 TRANSPOSE_8x8_16BIT(V00[0], V02[0], V04[0], V06[0], V08[0], V10[0], V12[0], V14[0], A00[0], A01[0], A02[0], A03[0], A04[0], A05[0], A06[0], A07[0]); TRANSPOSE_8x8_16BIT(V16[0], V18[0], V20[0], V22[0], V24[0], V26[0], V28[0], V30[0], A00[1], A01[1], A02[1], A03[1], A04[1], A05[1], A06[1], A07[1]); TRANSPOSE_8x8_16BIT(V32[0], V34[0], V36[0], V38[0], V40[0], V42[0], V44[0], V46[0], A00[2], A01[2], A02[2], A03[2], A04[2], A05[2], A06[2], A07[2]); @@ -4547,13 +4558,13 @@ void wavelet_64x16_sse128(coeff_t *coeff) void wavelet_64x64_sse128(coeff_t *coeff) { - //锟斤拷锟斤拷 16*64 + //茂驴陆茂驴陆茂驴陆茂驴陆 16*64 __m128i V00[8], V01[8], V02[8], V03[8], V04[8], V05[8], V06[8], V07[8], V08[8], V09[8], V10[8], V11[8], V12[8], V13[8], V14[8], V15[8], V16[8], V17[8], V18[8], V19[8], V20[8], V21[8], V22[8], V23[8], V24[8], V25[8], V26[8], V27[8], V28[8], V29[8], V30[8], V31[8], V32[8], V33[8], V34[8], V35[8], V36[8], V37[8], V38[8], V39[8], V40[8], 
V41[8], V42[8], V43[8], V44[8], V45[8], V46[8], V47[8], V48[8], V49[8], V50[8], V51[8], V52[8], V53[8], V54[8], V55[8], V56[8], V57[8], V58[8], V59[8], V60[8], V61[8], V62[8], V63[8]; - //锟斤拷锟斤拷 64*64 + //茂驴陆茂驴陆茂驴陆茂驴陆 64*64 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8], T16[8], T17[8], T18[8], T19[8], T20[8], T21[8], T22[8], T23[8], T24[8], T25[8], T26[8], T27[8], T28[8], T29[8], T30[8], T31[8], T32[8], T33[8], T34[8], T35[8], T36[8], T37[8], T38[8], T39[8], T40[8], T41[8], T42[8], T43[8], T44[8], T45[8], T46[8], T47[8], T48[8], T49[8], T50[8], T51[8], T52[8], T53[8], T54[8], T55[8], T56[8], T57[8], T58[8], T59[8], T60[8], T61[8], T62[8], T63[8]; - //临时 32*64 + //脕脵脢卤 32*64 __m128i A00[4], A01[4], A02[4], A03[4], A04[4], A05[4], A06[4], A07[4], A08[4], A09[4], A10[4], A11[4], A12[4], A13[4], A14[4], A15[4], A16[4], A17[4], A18[4], A19[4], A20[4], A21[4], A22[4], A23[4], A24[4], A25[4], A26[4], A27[4], A28[4], A29[4], A30[4], A31[4], A32[4], A33[4], A34[4], A35[4], A36[4], A37[4], A38[4], A39[4], A40[4], A41[4], A42[4], A43[4], A44[4], A45[4], A46[4], A47[4], A48[4], A49[4], A50[4], A51[4], A52[4], A53[4], A54[4], A55[4], A56[4], A57[4], A58[4], A59[4], A60[4], A61[4], A62[4], A63[4]; __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; @@ -4636,7 +4647,7 @@ void wavelet_64x64_sse128(coeff_t *coeff) T62[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 62]); T63[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 63]); } - //0-15锟斤拷转锟斤拷 + //0-15茂驴陆茂驴陆脳陋茂驴陆茂驴陆 TRANSPOSE_16x16_16BIT( T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0], T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1], @@ -4661,7 +4672,7 @@ void wavelet_64x64_sse128(coeff_t *coeff) V48[0], V49[0], V50[0], V51[0], V52[0], V53[0], V54[0], V55[0], V56[0], 
V57[0], V58[0], V59[0], V60[0], V61[0], V62[0], V63[0], V48[1], V49[1], V50[1], V51[1], V52[1], V53[1], V54[1], V55[1], V56[1], V57[1], V58[1], V59[1], V60[1], V61[1], V62[1], V63[1] ); - //16-31锟斤拷转锟斤拷 + //16-31茂驴陆茂驴陆脳陋茂驴陆茂驴陆 TRANSPOSE_16x16_16BIT( T16[0], T17[0], T18[0], T19[0], T20[0], T21[0], T22[0], T23[0], T24[0], T25[0], T26[0], T27[0], T28[0], T29[0], T30[0], T31[0], T16[1], T17[1], T18[1], T19[1], T20[1], T21[1], T22[1], T23[1], T24[1], T25[1], T26[1], T27[1], T28[1], T29[1], T30[1], T31[1], @@ -4686,7 +4697,7 @@ void wavelet_64x64_sse128(coeff_t *coeff) V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2], V48[3], V49[3], V50[3], V51[3], V52[3], V53[3], V54[3], V55[3], V56[3], V57[3], V58[3], V59[3], V60[3], V61[3], V62[3], V63[3] ); - //32-47锟斤拷转锟斤拷 + //32-47茂驴陆茂驴陆脳陋茂驴陆茂驴陆 TRANSPOSE_16x16_16BIT( T32[0], T33[0], T34[0], T35[0], T36[0], T37[0], T38[0], T39[0], T40[0], T41[0], T42[0], T43[0], T44[0], T45[0], T46[0], T47[0], T32[1], T33[1], T34[1], T35[1], T36[1], T37[1], T38[1], T39[1], T40[1], T41[1], T42[1], T43[1], T44[1], T45[1], T46[1], T47[1], @@ -4711,7 +4722,7 @@ void wavelet_64x64_sse128(coeff_t *coeff) V48[4], V49[4], V50[4], V51[4], V52[4], V53[4], V54[4], V55[4], V56[4], V57[4], V58[4], V59[4], V60[4], V61[4], V62[4], V63[4], V48[5], V49[5], V50[5], V51[5], V52[5], V53[5], V54[5], V55[5], V56[5], V57[5], V58[5], V59[5], V60[5], V61[5], V62[5], V63[5] ); - //48-63锟斤拷转锟斤拷 + //48-63茂驴陆茂驴陆脳陋茂驴陆茂驴陆 TRANSPOSE_16x16_16BIT( T48[0], T49[0], T50[0], T51[0], T52[0], T53[0], T54[0], T55[0], T56[0], T57[0], T58[0], T59[0], T60[0], T61[0], T62[0], T63[0], T48[1], T49[1], T50[1], T51[1], T52[1], T53[1], T54[1], T55[1], T56[1], T57[1], T58[1], T59[1], T60[1], T61[1], T62[1], T63[1], @@ -4840,7 +4851,7 @@ void wavelet_64x64_sse128(coeff_t *coeff) A48[0], A49[0], A50[0], A51[0], A52[0], A53[0], A54[0], A55[0], A56[0], A57[0], A58[0], A59[0], A60[0], A61[0], A62[0], A63[0], A48[1], A49[1], 
A50[1], A51[1], A52[1], A53[1], A54[1], A55[1], A56[1], A57[1], A58[1], A59[1], A60[1], A61[1], A62[1], A63[1] ); - //16-31锟斤拷 + //16-31茂驴陆茂驴陆 TRANSPOSE_16x16_16BIT( V32[0], V34[0], V36[0], V38[0], V40[0], V42[0], V44[0], V46[0], V48[0], V50[0], V52[0], V54[0], V56[0], V58[0], V60[0], V62[0], V32[1], V34[1], V36[1], V38[1], V40[1], V42[1], V44[1], V46[1], V48[1], V50[1], V52[1], V54[1], V56[1], V58[1], V60[1], V62[1], @@ -4986,40 +4997,44 @@ void wavelet_64x64_sse128(coeff_t *coeff) /* --------------------------------------------------------------------------- */ -void dct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_64x64_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_64x64_sse128(dst); - dct_c_32x32_sse128(dst, dst, 32 | 1); + dct_c_32x32_sse128(h, dst, dst, 32 | 1); } /* --------------------------------------------------------------------------- */ -void dct_c_64x64_half_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_64x64_half_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_64x64_sse128(dst); - dct_c_32x32_half_sse128(dst, dst, 32 | 1); + dct_c_32x32_half_sse128(h, dst, dst, 32 | 1); } /* --------------------------------------------------------------------------- */ -void dct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_64x16_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_64x16_sse128(dst); - dct_c_32x8_sse128(dst, dst, 32 | 0x01); + dct_c_32x8_sse128(h, dst, dst, 32 | 0x01); } /* --------------------------------------------------------------------------- */ -void dct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_src) +void dct_c_16x64_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); 
wavelet_16x64_sse128(dst); - dct_c_8x32_sse128(dst, dst, 8 | 0x01); + dct_c_8x32_sse128(h, dst, dst, 8 | 0x01); } diff --git a/source/common/vec/intrinsic_deblock.c b/source/common/vec/intrinsic_deblock.c index 563ef0c..bb4be29 100644 --- a/source/common/vec/intrinsic_deblock.c +++ b/source/common/vec/intrinsic_deblock.c @@ -42,6 +42,7 @@ #include #include +#if !HIGH_BIT_DEPTH void deblock_edge_ver_sse128(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag) { pel_t *pTmp = SrcPtr - 4; @@ -805,3 +806,844 @@ void deblock_edge_hor_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int A ((int32_t*)(SrcPtrV - inc2))[0] = M128_I32(UL1, 1); ((int32_t*)(SrcPtrV + inc ))[0] = M128_I32(UR1, 1); } +#else +/***************************************************************************** +* Copyright (C) 2016 uavs2dec project, +* National Engineering Laboratory for Video Technology(Shenzhen), +* Digital Media R&D Center at Peking University Shenzhen Graduate School, China +* Project Leader: Ronggang Wang +* +* Main Authors: Zhenyu Wang , Kui Fan +* Shenghao Zhang <1219759986@qq.com>拢卢 Bingjie Han, Kaili Yao, Hongbin Cao, Yueming Wang, +* Jing Su, Jiaying Yan, Junru Li +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. 
+* For more information, contact us at rgwang@pkusz.edu.cn. +*****************************************************************************/ + +void deblock_edge_ver_sse128(pel_t* SrcPtr, int stride, int Alpha, int Beta, uint8_t* flt_flag) +{ + pel_t* pTmp = SrcPtr - 4; + int flag0 = flt_flag[0] ? -1 : 0; + int flag1 = flt_flag[1] ? -1 : 0; + __m128i TL0, TL1, TL2, TL3; + __m128i TR0, TR1, TR2, TR3; + __m128i TL0l, TL1l, TL2l; + __m128i TR0l, TR1l, TR2l; + __m128i V0, V1, V2, V3, V4, V5; + __m128i T0, T1, T2, T3, T4, T5, T6, T7; + __m128i M0, M1, M2, M3, M4, M5, M6, M7; + __m128i FLT_L, FLT_R, FLT, FS; + __m128i FS3, FS4, FS56; + + __m128i ALPHA = _mm_set1_epi16((short)Alpha); + __m128i BETA = _mm_set1_epi16((short)Beta); + __m128i c_0 = _mm_set1_epi16(0); + __m128i c_1 = _mm_set1_epi16(1); + __m128i c_2 = _mm_set1_epi16(2); + __m128i c_3 = _mm_set1_epi16(3); + __m128i c_4 = _mm_set1_epi16(4); + __m128i c_8 = _mm_set1_epi16(8); + __m128i c_16 = _mm_set1_epi16(16); + + T0 = _mm_loadu_si128((__m128i*)(pTmp)); + T1 = _mm_loadu_si128((__m128i*)(pTmp + stride)); + T2 = _mm_loadu_si128((__m128i*)(pTmp + stride * 2)); + T3 = _mm_loadu_si128((__m128i*)(pTmp + stride * 3)); + T4 = _mm_loadu_si128((__m128i*)(pTmp + stride * 4)); + T5 = _mm_loadu_si128((__m128i*)(pTmp + stride * 5)); + T6 = _mm_loadu_si128((__m128i*)(pTmp + stride * 6)); + T7 = _mm_loadu_si128((__m128i*)(pTmp + stride * 7)); + + M0 = _mm_unpacklo_epi16(T0, T1); + M1 = _mm_unpackhi_epi16(T0, T1); + M2 = _mm_unpacklo_epi16(T2, T3); + M3 = _mm_unpackhi_epi16(T2, T3); + M4 = _mm_unpacklo_epi16(T4, T5); + M5 = _mm_unpackhi_epi16(T4, T5); + M6 = _mm_unpacklo_epi16(T6, T7); + M7 = _mm_unpackhi_epi16(T6, T7); + + T0 = _mm_unpacklo_epi32(M0, M2); + T1 = _mm_unpackhi_epi32(M0, M2); + T2 = _mm_unpacklo_epi32(M1, M3); + T3 = _mm_unpackhi_epi32(M1, M3); + T4 = _mm_unpacklo_epi32(M4, M6); + T5 = _mm_unpackhi_epi32(M4, M6); + T6 = _mm_unpacklo_epi32(M5, M7); + T7 = _mm_unpackhi_epi32(M5, M7); + + TL3 = _mm_unpacklo_epi64(T0, 
T4); + TL2 = _mm_unpackhi_epi64(T0, T4); + TR0 = _mm_unpacklo_epi64(T2, T6); + TR1 = _mm_unpackhi_epi64(T2, T6); + TL1 = _mm_unpacklo_epi64(T1, T5); + TL0 = _mm_unpackhi_epi64(T1, T5); + TR2 = _mm_unpacklo_epi64(T3, T7); + TR3 = _mm_unpackhi_epi64(T3, T7); + +#define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) + + T0 = _mm_subabs_epu16(TL0, TR0); + T1 = _mm_cmpgt_epi16(T0, c_1); + T2 = _mm_cmpgt_epi16(ALPHA, T0); + + M0 = _mm_set_epi32(flag1, flag1, flag0, flag0); + M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 + + T0 = _mm_subabs_epu16(TL1, TL0); + T1 = _mm_subabs_epu16(TR1, TR0); + FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); + FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); + + T0 = _mm_subabs_epu16(TL2, TL0); + T1 = _mm_subabs_epu16(TR2, TR0); + M1 = _mm_cmpgt_epi16(BETA, T0); + M2 = _mm_cmpgt_epi16(BETA, T1); + FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); + FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); + FLT = _mm_add_epi16(FLT_L, FLT_R); + + M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); + T0 = _mm_sub_epi16(FLT, c_2); + T1 = _mm_sub_epi16(FLT, c_3); + T2 = _mm_subabs_epu16(TL1, TR1); + + FS56 = _mm_blendv_epi8(T1, T0, M1); + FS4 = _mm_blendv_epi8(c_1, c_2, _mm_cmpeq_epi16(FLT_L, c_2)); + FS3 = _mm_blendv_epi8(c_0, c_1, _mm_cmpgt_epi16(BETA, T2)); + + FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4)); + FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); + FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, c_3)); + + FS = _mm_and_si128(FS, M0); + +#undef _mm_subabs_epu16 + + + TL0l = TL0; + TL1l = TL1; + TR0l = TR0; + TR1l = TR1; + + /* fs == 1 */ + T2 = _mm_add_epi16(_mm_add_epi16(TL0l, TR0l), c_2); // L0 + R0 + 2 + + V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0l, 1), T2), 2); + + V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0l, 1), T2), 2); + + TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); + TR0 = _mm_blendv_epi8(TR0, V1, 
_mm_cmpeq_epi16(FS, c_1)); + + /* fs == 2 */ + T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 + T3 = _mm_slli_epi16(T3, 1); + + T0 = _mm_add_epi16(_mm_slli_epi16(TL1l, 1), _mm_add_epi16(TL1l, TR0l)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TL0l, 3), _mm_add_epi16(T0, T2)); + + V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR1l, 1), _mm_add_epi16(TR1l, TL0l)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR0l, 3), _mm_add_epi16(T0, T2)); + + V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); + + TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_2)); + TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_2)); + + /* fs == 3 */ + T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 + T3 = _mm_slli_epi16(T3, 1); + + T0 = _mm_add_epi16(_mm_slli_epi16(TL1l, 2), _mm_add_epi16(TL2, TR1l)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TL0l, 1), _mm_add_epi16(T0, T2)); + + V0 = _mm_srli_epi16(T0, 4); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR1l, 2), _mm_add_epi16(TR2, TL1l)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR0l, 1), _mm_add_epi16(T0, T2)); + + V1 = _mm_srli_epi16(T0, 4); + + TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_3)); + TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_3)); + + T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0l), _mm_slli_epi16(TL2, 1)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1l, 3)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0l, 2)); + V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); + + T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0l), _mm_slli_epi16(TR2, 1)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1l, 3)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0l, 2)); + V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); + + TL1 = _mm_blendv_epi8(TL1, V2, _mm_cmpeq_epi16(FS, c_3)); + TR1 = _mm_blendv_epi8(TR1, V3, _mm_cmpeq_epi16(FS, c_3)); + + FS = _mm_cmpeq_epi16(FS, c_4); + + if (!_mm_testz_si128(FS, _mm_set1_epi16(-1))) { /* fs == 4 */ + TL2l = TL2; + TR2l = TR2; + /* cal L0/R0 */ + T0 = 
_mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TL0l, TL2l), TR0l), 3); + T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TL0l, TL2l)); + T2 = _mm_add_epi16(_mm_slli_epi16(TR2l, 1), _mm_slli_epi16(TR2l, 2)); + V0 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5); + + T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TR0l, TR2l), TL0l), 3); + T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TR0l, TR2l)); + T2 = _mm_add_epi16(_mm_slli_epi16(TL2l, 1), _mm_slli_epi16(TL2l, 2)); + V1 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5); + + TL0 = _mm_blendv_epi8(TL0, V0, FS); + TR0 = _mm_blendv_epi8(TR0, V1, FS); + + /* cal L1/R1 */ + T0 = _mm_slli_epi16(_mm_add_epi16(TL2l, TR0l), 1); + T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TL0l, 3), TL0l)); + T2 = _mm_add_epi16(_mm_slli_epi16(TL2l, 2), _mm_add_epi16(TR0l, c_8)); + V2 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4); + + T0 = _mm_slli_epi16(_mm_add_epi16(TR2l, TL0l), 1); + T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TR0l, 3), TR0l)); + T2 = _mm_add_epi16(_mm_slli_epi16(TR2l, 2), _mm_add_epi16(TL0l, c_8)); + V3 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4); + + TL1 = _mm_blendv_epi8(TL1, V2, FS); + TR1 = _mm_blendv_epi8(TR1, V3, FS); + + /* cal L2/R2 */ + T0 = _mm_add_epi16(_mm_slli_epi16(TL2l, 1), TL2l); + T2 = _mm_add_epi16(_mm_slli_epi16(TL0l, 2), TR0l); + V4 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR2l, 1), TR2l); + T2 = _mm_add_epi16(_mm_slli_epi16(TR0l, 2), TL0l); + V5 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3); + + TL2 = _mm_blendv_epi8(TL2, V4, FS); + TR2 = _mm_blendv_epi8(TR2, V5, FS); + } + + /* store result */ + M0 = _mm_unpacklo_epi16(TL3, TL2); + M1 = _mm_unpackhi_epi16(TL3, TL2); + M2 = _mm_unpacklo_epi16(TL1, TL0); + M3 = _mm_unpackhi_epi16(TL1, TL0); + M4 = _mm_unpacklo_epi16(TR0, TR1); + M5 = _mm_unpackhi_epi16(TR0, TR1); + M6 = _mm_unpacklo_epi16(TR2, TR3); + M7 = _mm_unpackhi_epi16(TR2, TR3); + + T0 = 
_mm_unpacklo_epi32(M0, M2); + T1 = _mm_unpackhi_epi32(M0, M2); + T2 = _mm_unpacklo_epi32(M1, M3); + T3 = _mm_unpackhi_epi32(M1, M3); + T4 = _mm_unpacklo_epi32(M4, M6); + T5 = _mm_unpackhi_epi32(M4, M6); + T6 = _mm_unpacklo_epi32(M5, M7); + T7 = _mm_unpackhi_epi32(M5, M7); + + M0 = _mm_unpacklo_epi64(T0, T4); + M1 = _mm_unpackhi_epi64(T0, T4); + M4 = _mm_unpacklo_epi64(T2, T6); + M5 = _mm_unpackhi_epi64(T2, T6); + M2 = _mm_unpacklo_epi64(T1, T5); + M3 = _mm_unpackhi_epi64(T1, T5); + M6 = _mm_unpacklo_epi64(T3, T7); + M7 = _mm_unpackhi_epi64(T3, T7); + + pTmp = SrcPtr - 4; + _mm_storeu_si128((__m128i*)(pTmp), M0); + pTmp += stride; + _mm_storeu_si128((__m128i*)(pTmp), M1); + pTmp += stride; + _mm_storeu_si128((__m128i*)(pTmp), M2); + pTmp += stride; + _mm_storeu_si128((__m128i*)(pTmp), M3); + pTmp += stride; + _mm_storeu_si128((__m128i*)(pTmp), M4); + pTmp += stride; + _mm_storeu_si128((__m128i*)(pTmp), M5); + pTmp += stride; + _mm_storeu_si128((__m128i*)(pTmp), M6); + pTmp += stride; + _mm_storeu_si128((__m128i*)(pTmp), M7); +} + +void deblock_edge_hor_sse128(pel_t* SrcPtr, int stride, int Alpha, int Beta, uint8_t* flt_flag) +{ + int inc = stride; + int inc2 = inc << 1; + int inc3 = inc + inc2; + int flag0 = flt_flag[0] ? -1 : 0; + int flag1 = flt_flag[1] ? 
-1 : 0; + + __m128i TL0, TL1, TL2; + __m128i TR0, TR1, TR2; + __m128i TL0w, TL1w, TL2w, TR0w, TR1w, TR2w; //for write + __m128i V0, V1, V2, V3, V4, V5; + __m128i T0, T1, T2; + __m128i M0, M1, M2; + __m128i FLT_L, FLT_R, FLT, FS; + __m128i FS3, FS4, FS56; + + __m128i ALPHA = _mm_set1_epi16((short)Alpha); + __m128i BETA = _mm_set1_epi16((short)Beta); + __m128i c_0 = _mm_set1_epi16(0); + __m128i c_1 = _mm_set1_epi16(1); + __m128i c_2 = _mm_set1_epi16(2); + __m128i c_3 = _mm_set1_epi16(3); + __m128i c_4 = _mm_set1_epi16(4); + __m128i c_8 = _mm_set1_epi16(8); + __m128i c_16 = _mm_set1_epi16(16); + + TL2 = _mm_loadu_si128((__m128i*)(SrcPtr - inc3)); + TL1 = _mm_loadu_si128((__m128i*)(SrcPtr - inc2)); + TL0 = _mm_loadu_si128((__m128i*)(SrcPtr - inc)); + TR0 = _mm_loadu_si128((__m128i*)(SrcPtr + 0)); + TR1 = _mm_loadu_si128((__m128i*)(SrcPtr + inc)); + TR2 = _mm_loadu_si128((__m128i*)(SrcPtr + inc2)); + +#define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) + + T0 = _mm_subabs_epu16(TL0, TR0); + T1 = _mm_cmpgt_epi16(T0, c_1); + T2 = _mm_cmpgt_epi16(ALPHA, T0); + M0 = _mm_set_epi32(flag1, flag1, flag0, flag0); + M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 + + T0 = _mm_subabs_epu16(TL1, TL0); + T1 = _mm_subabs_epu16(TR1, TR0); + FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); + FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); + + T0 = _mm_subabs_epu16(TL2, TL0); + T1 = _mm_subabs_epu16(TR2, TR0); + M1 = _mm_cmpgt_epi16(BETA, T0); + M2 = _mm_cmpgt_epi16(BETA, T1); + FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); + FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); + FLT = _mm_add_epi16(FLT_L, FLT_R); + + M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); + T0 = _mm_subs_epi16(FLT, c_2); + T1 = _mm_subs_epi16(FLT, c_3); + T2 = _mm_subabs_epu16(TL1, TR1); + + FS56 = _mm_blendv_epi8(T1, T0, M1); + FS4 = _mm_blendv_epi8(c_1, c_2, _mm_cmpeq_epi16(FLT_L, c_2)); + FS3 = _mm_blendv_epi8(c_0, c_1, 
_mm_cmpgt_epi16(BETA, T2)); + + FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4)); + FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); + FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, c_3)); + + FS = _mm_and_si128(FS, M0); + +#undef _mm_subabs_epu16 + + TR0w = TR0; + TR1w = TR1; + TL0w = TL0; + TL1w = TL1; + + /* fs == 1 */ + T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), c_2); // L0 + R0 + 2 + + V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0, 1), T2), 2); + + V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0, 1), T2), 2); + + TL0w = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); + TR0w = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1)); + + /* fs == 2 */ + T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 + + T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 1), _mm_add_epi16(TL1, TR0)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 3), _mm_add_epi16(T0, T2)); + + V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 1), _mm_add_epi16(TR1, TL0)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 3), _mm_add_epi16(T0, T2)); + + V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); + + TL0w = _mm_blendv_epi8(TL0w, V0, _mm_cmpeq_epi16(FS, c_2)); + TR0w = _mm_blendv_epi8(TR0w, V1, _mm_cmpeq_epi16(FS, c_2)); + + /* fs == 3 */ + T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 + + T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 2), _mm_add_epi16(TL2, TR1)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 1), _mm_add_epi16(T0, T2)); + + V0 = _mm_srli_epi16(T0, 4); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 2), _mm_add_epi16(TR2, TL1)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 1), _mm_add_epi16(T0, T2)); + + V1 = _mm_srli_epi16(T0, 4); + + TL0w = _mm_blendv_epi8(TL0w, V0, _mm_cmpeq_epi16(FS, c_3)); + TR0w = _mm_blendv_epi8(TR0w, V1, _mm_cmpeq_epi16(FS, c_3)); + + T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0), _mm_slli_epi16(TL2, 1)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1, 3)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0, 
2)); + V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); + + T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0), _mm_slli_epi16(TR2, 1)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1, 3)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0, 2)); + V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); + + TL1w = _mm_blendv_epi8(TL1w, V2, _mm_cmpeq_epi16(FS, c_3)); + TR1w = _mm_blendv_epi8(TR1w, V3, _mm_cmpeq_epi16(FS, c_3)); + + FS = _mm_cmpeq_epi16(FS, c_4); + + if (!_mm_testz_si128(FS, _mm_set1_epi16(-1))) { /* fs == 4 */ + /* cal L0/R0 */ + T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TL0, TL2), TR0), 3); + T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TL0, TL2)); + T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), _mm_slli_epi16(TR2, 2)); + V0 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5); + + T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TR0, TR2), TL0), 3); + T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TR0, TR2)); + T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), _mm_slli_epi16(TL2, 2)); + V1 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5); + + TL0w = _mm_blendv_epi8(TL0w, V0, FS); + TR0w = _mm_blendv_epi8(TR0w, V1, FS); + + /* cal L1/R1 */ + T0 = _mm_slli_epi16(_mm_add_epi16(TL2, TR0), 1); + T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TL0, 3), TL0)); + T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 2), _mm_add_epi16(TR0, c_8)); + V2 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4); + + T0 = _mm_slli_epi16(_mm_add_epi16(TR2, TL0), 1); + T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TR0, 3), TR0)); + T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 2), _mm_add_epi16(TL0, c_8)); + V3 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4); + + TL1w = _mm_blendv_epi8(TL1w, V2, FS); + TR1w = _mm_blendv_epi8(TR1w, V3, FS); + + /* cal L2/R2 */ + T0 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), TL2); + T2 = _mm_add_epi16(_mm_slli_epi16(TL0, 2), TR0); + V4 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), TR2); + T2 = 
_mm_add_epi16(_mm_slli_epi16(TR0, 2), TL0); + V5 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3); + + TL2w = _mm_blendv_epi8(TL2, V4, FS); + TR2w = _mm_blendv_epi8(TR2, V5, FS); + + /* store result */ + _mm_storeu_si128((__m128i*)(SrcPtr - inc), TL0w); + _mm_storeu_si128((__m128i*)(SrcPtr - 0), TR0w); + + _mm_storeu_si128((__m128i*)(SrcPtr - inc2), TL1w); + _mm_storeu_si128((__m128i*)(SrcPtr + inc), TR1w); + + _mm_storeu_si128((__m128i*)(SrcPtr - inc3), TL2w); + _mm_storeu_si128((__m128i*)(SrcPtr + inc2), TR2w); + } + else { + /* store result */ + _mm_storeu_si128((__m128i*)(SrcPtr - inc), TL0w); + _mm_storeu_si128((__m128i*)(SrcPtr - 0), TR0w); + + _mm_storeu_si128((__m128i*)(SrcPtr - inc2), TL1w); + _mm_storeu_si128((__m128i*)(SrcPtr + inc), TR1w); + } + +} + +void deblock_edge_ver_c_sse128(pel_t* SrcPtrU, pel_t* SrcPtrV, int stride, int Alpha, int Beta, uint8_t* flt_flag) +{ + pel_t* pTmp; + int flag0 = flt_flag[0] ? -1 : 0; + int flag1 = flt_flag[1] ? -1 : 0; + + __m128i UVL0, UVL1, UVL2, UVR0, UVR1, UVR2; + __m128i TL0, TL1, TL2, TL3; + __m128i TR0, TR1, TR2, TR3; + __m128i T0, T1, T2, T3, T4, T5, T6, T7; + __m128i V0, V1, V2, V3; + __m128i M0, M1, M2, M3, M4, M5, M6, M7; + __m128i FLT_L, FLT_R, FLT, FS; + __m128i FS4, FS56; + + __m128i ALPHA = _mm_set1_epi16((short)Alpha); + __m128i BETA = _mm_set1_epi16((short)Beta); + __m128i c_0 = _mm_set1_epi16(0); + __m128i c_1 = _mm_set1_epi16(1); + __m128i c_2 = _mm_set1_epi16(2); + __m128i c_3 = _mm_set1_epi16(3); + __m128i c_4 = _mm_set1_epi16(4); + __m128i c_8 = _mm_set1_epi16(8); + + pTmp = SrcPtrU - 4; + T0 = _mm_loadu_si128((__m128i*)(pTmp)); + T1 = _mm_loadu_si128((__m128i*)(pTmp + stride)); + T2 = _mm_loadu_si128((__m128i*)(pTmp + stride * 2)); + T3 = _mm_loadu_si128((__m128i*)(pTmp + stride * 3)); + + pTmp = SrcPtrV - 4; + T4 = _mm_loadu_si128((__m128i*)(pTmp)); + T5 = _mm_loadu_si128((__m128i*)(pTmp + stride)); + T6 = _mm_loadu_si128((__m128i*)(pTmp + stride * 2)); + T7 = 
_mm_loadu_si128((__m128i*)(pTmp + stride * 3)); + + M0 = _mm_unpacklo_epi16(T0, T1); + M1 = _mm_unpackhi_epi16(T0, T1); + M2 = _mm_unpacklo_epi16(T2, T3); + M3 = _mm_unpackhi_epi16(T2, T3); + M4 = _mm_unpacklo_epi16(T4, T5); + M5 = _mm_unpackhi_epi16(T4, T5); + M6 = _mm_unpacklo_epi16(T6, T7); + M7 = _mm_unpackhi_epi16(T6, T7); + + T0 = _mm_unpacklo_epi32(M0, M2); + T1 = _mm_unpackhi_epi32(M0, M2); + T2 = _mm_unpacklo_epi32(M1, M3); + T3 = _mm_unpackhi_epi32(M1, M3); + T4 = _mm_unpacklo_epi32(M4, M6); + T5 = _mm_unpackhi_epi32(M4, M6); + T6 = _mm_unpacklo_epi32(M5, M7); + T7 = _mm_unpackhi_epi32(M5, M7); + + TL3 = _mm_unpacklo_epi64(T0, T4); + TL2 = _mm_unpackhi_epi64(T0, T4); + TR0 = _mm_unpacklo_epi64(T2, T6); + TR1 = _mm_unpackhi_epi64(T2, T6); + TL1 = _mm_unpacklo_epi64(T1, T5); + TL0 = _mm_unpackhi_epi64(T1, T5); + TR2 = _mm_unpacklo_epi64(T3, T7); + TR3 = _mm_unpackhi_epi64(T3, T7); + +#define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) + + T0 = _mm_subabs_epu16(TL0, TR0); + T1 = _mm_cmpgt_epi16(T0, c_1); + T2 = _mm_cmpgt_epi16(ALPHA, T0); + M0 = _mm_set_epi32(flag1, flag0, flag1, flag0); + M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 + + T0 = _mm_subabs_epu16(TL1, TL0); + T1 = _mm_subabs_epu16(TR1, TR0); + FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); + FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); + + T0 = _mm_subabs_epu16(TL2, TL0); + T1 = _mm_subabs_epu16(TR2, TR0); + M1 = _mm_cmpgt_epi16(BETA, T0); + M2 = _mm_cmpgt_epi16(BETA, T1); + FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); + FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); + FLT = _mm_add_epi16(FLT_L, FLT_R); + + M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); + T0 = _mm_sub_epi16(FLT, c_3); + T1 = _mm_sub_epi16(FLT, c_4); + T2 = _mm_subabs_epu16(TL1, TR1); + + FS56 = _mm_blendv_epi8(T1, T0, M1); + FS4 = _mm_blendv_epi8(c_0, c_1, _mm_cmpeq_epi16(FLT_L, c_2)); + + FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, 
c_4)); + FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); + + FS = _mm_and_si128(FS, M0); + +#undef _mm_subabs_epu16 + + UVL0 = TL0; + UVL1 = TL1; + UVL2 = TL2; + UVR0 = TR0; + UVR1 = TR1; + UVR2 = TR2; + + /* fs == 1 */ + T2 = _mm_add_epi16(_mm_add_epi16(UVL0, UVR0), c_2); // L0 + R0 + 2 + + V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(UVL0, 1), T2), 2); + + V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(UVR0, 1), T2), 2); + + TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); + TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1)); + + /* fs == 2 */ + T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 + T3 = _mm_slli_epi16(T3, 1); + + T0 = _mm_add_epi16(_mm_slli_epi16(UVL1, 1), _mm_add_epi16(UVL1, UVR0)); + + T0 = _mm_add_epi16(_mm_slli_epi16(UVL0, 3), _mm_add_epi16(T0, T2)); + + V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); + + T0 = _mm_add_epi16(_mm_slli_epi16(UVR1, 1), _mm_add_epi16(UVR1, UVL0)); + + T0 = _mm_add_epi16(_mm_slli_epi16(UVR0, 3), _mm_add_epi16(T0, T2)); + + V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); + + TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_2)); + TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_2)); + + /* fs == 3 */ + T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 + T3 = _mm_slli_epi16(T3, 1); + + T0 = _mm_add_epi16(_mm_slli_epi16(UVL1, 2), _mm_add_epi16(UVL2, UVR1)); + + T0 = _mm_add_epi16(_mm_slli_epi16(UVL0, 1), _mm_add_epi16(T0, T2)); + + V0 = _mm_srli_epi16(T0, 4); + + T0 = _mm_add_epi16(_mm_slli_epi16(UVR1, 2), _mm_add_epi16(UVR2, UVL1)); + + T0 = _mm_add_epi16(_mm_slli_epi16(UVR0, 1), _mm_add_epi16(T0, T2)); + + V1 = _mm_srli_epi16(T0, 4); + + TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_3)); + TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_3)); + + T0 = _mm_add_epi16(_mm_add_epi16(UVL2, UVR0), _mm_slli_epi16(UVL2, 1)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVL1, 3)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVL0, 2)); + V2 = 
_mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); + + T0 = _mm_add_epi16(_mm_add_epi16(UVR2, UVL0), _mm_slli_epi16(UVR2, 1)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVR1, 3)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVR0, 2)); + V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); + + TL1 = _mm_blendv_epi8(TL1, V2, _mm_cmpeq_epi16(FS, c_3)); + TR1 = _mm_blendv_epi8(TR1, V3, _mm_cmpeq_epi16(FS, c_3)); + + /* store result */ + M0 = _mm_unpacklo_epi16(TL3, TL2); + M1 = _mm_unpackhi_epi16(TL3, TL2); + M2 = _mm_unpacklo_epi16(TL1, TL0); + M3 = _mm_unpackhi_epi16(TL1, TL0); + M4 = _mm_unpacklo_epi16(TR0, TR1); + M5 = _mm_unpackhi_epi16(TR0, TR1); + M6 = _mm_unpacklo_epi16(TR2, TR3); + M7 = _mm_unpackhi_epi16(TR2, TR3); + + T0 = _mm_unpacklo_epi32(M0, M2); + T1 = _mm_unpackhi_epi32(M0, M2); + T2 = _mm_unpacklo_epi32(M1, M3); + T3 = _mm_unpackhi_epi32(M1, M3); + T4 = _mm_unpacklo_epi32(M4, M6); + T5 = _mm_unpackhi_epi32(M4, M6); + T6 = _mm_unpacklo_epi32(M5, M7); + T7 = _mm_unpackhi_epi32(M5, M7); + + M0 = _mm_unpacklo_epi64(T0, T4); + M1 = _mm_unpackhi_epi64(T0, T4); + M4 = _mm_unpacklo_epi64(T2, T6); + M5 = _mm_unpackhi_epi64(T2, T6); + M2 = _mm_unpacklo_epi64(T1, T5); + M3 = _mm_unpackhi_epi64(T1, T5); + M6 = _mm_unpacklo_epi64(T3, T7); + M7 = _mm_unpackhi_epi64(T3, T7); + + pTmp = SrcPtrU - 4; + _mm_storeu_si128((__m128i*)(pTmp), M0); + _mm_storeu_si128((__m128i*)(pTmp + stride), M1); + _mm_storeu_si128((__m128i*)(pTmp + (stride << 1)), M2); + _mm_storeu_si128((__m128i*)(pTmp + stride * 3), M3); + + pTmp = SrcPtrV - 4; + _mm_storeu_si128((__m128i*)(pTmp), M4); + _mm_storeu_si128((__m128i*)(pTmp + stride), M5); + _mm_storeu_si128((__m128i*)(pTmp + (stride << 1)), M6); + _mm_storeu_si128((__m128i*)(pTmp + stride * 3), M7); +} + +void deblock_edge_hor_c_sse128(pel_t* SrcPtrU, pel_t* SrcPtrV, int stride, int Alpha, int Beta, uint8_t* flt_flag) +{ + int inc = stride; + int inc2 = inc << 1; + int inc3 = inc + inc2; + int flag0 = flt_flag[0] ? -1 : 0; + int flag1 = flt_flag[1] ? 
-1 : 0; + + __m128i UL0, UL1, UR0, UR1; + __m128i TL0, TL1, TL2; + __m128i TR0, TR1, TR2; + __m128i T0, T1, T2; + __m128i V0, V1, V2, V3; + __m128i M0, M1, M2; + __m128i FLT_L, FLT_R, FLT, FS; + __m128i FS4, FS56; + + __m128i ALPHA = _mm_set1_epi16((short)Alpha); + __m128i BETA = _mm_set1_epi16((short)Beta); + __m128i c_0 = _mm_set1_epi16(0); + __m128i c_1 = _mm_set1_epi16(1); + __m128i c_2 = _mm_set1_epi16(2); + __m128i c_3 = _mm_set1_epi16(3); + __m128i c_4 = _mm_set1_epi16(4); + __m128i c_8 = _mm_set1_epi16(8); + + TL0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU - inc)), _mm_loadl_epi64((__m128i*)(SrcPtrV - inc))); + TL1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU - inc2)), _mm_loadl_epi64((__m128i*)(SrcPtrV - inc2))); + TL2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU - inc3)), _mm_loadl_epi64((__m128i*)(SrcPtrV - inc3))); + TR0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU)), _mm_loadl_epi64((__m128i*)(SrcPtrV))); + TR1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU + inc)), _mm_loadl_epi64((__m128i*)(SrcPtrV + inc))); + TR2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(SrcPtrU + inc2)), _mm_loadl_epi64((__m128i*)(SrcPtrV + inc2))); + +#define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) + + T0 = _mm_subabs_epu16(TL0, TR0); + T1 = _mm_cmpgt_epi16(T0, c_1); + T2 = _mm_cmpgt_epi16(ALPHA, T0); + + M0 = _mm_set_epi32(flag1, flag0, flag1, flag0); + M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 + + T0 = _mm_subabs_epu16(TL1, TL0); + T1 = _mm_subabs_epu16(TR1, TR0); + FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); + FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); + + T0 = _mm_subabs_epu16(TL2, TL0); + T1 = _mm_subabs_epu16(TR2, TR0); + M1 = _mm_cmpgt_epi16(BETA, T0); + M2 = _mm_cmpgt_epi16(BETA, T1); + FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); + FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); + FLT = _mm_add_epi16(FLT_L, FLT_R); + + M1 = 
_mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); + T0 = _mm_subs_epi16(FLT, c_3); + T1 = _mm_subs_epi16(FLT, c_4); + + FS56 = _mm_blendv_epi8(T1, T0, M1); + FS4 = _mm_blendv_epi8(c_0, c_1, _mm_cmpeq_epi16(FLT_L, c_2)); + + FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4)); + FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); + + FS = _mm_and_si128(FS, M0); + +#undef _mm_subabs_epu16 + + UR0 = TR0; //UR0 TR0 to store + UR1 = TR1; + UL0 = TL0; + UL1 = TL1; + + /* fs == 1 */ + T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), c_2); // L0 + R0 + 2 + + V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0, 1), T2), 2); + + V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0, 1), T2), 2); + + UL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); + UR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1)); + + /* fs == 2 */ + T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 + + T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 1), _mm_add_epi16(TL1, TR0)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 3), _mm_add_epi16(T0, T2)); + + V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 1), _mm_add_epi16(TR1, TL0)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 3), _mm_add_epi16(T0, T2)); + + V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); + + UL0 = _mm_blendv_epi8(UL0, V0, _mm_cmpeq_epi16(FS, c_2)); + UR0 = _mm_blendv_epi8(UR0, V1, _mm_cmpeq_epi16(FS, c_2)); + + /* fs == 3 */ + T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 + + T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 2), _mm_add_epi16(TL2, TR1)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 1), _mm_add_epi16(T0, T2)); + + V0 = _mm_srli_epi16(T0, 4); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 2), _mm_add_epi16(TR2, TL1)); + + T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 1), _mm_add_epi16(T0, T2)); + + V1 = _mm_srli_epi16(T0, 4); + + UL0 = _mm_blendv_epi8(UL0, V0, _mm_cmpeq_epi16(FS, c_3)); + UR0 = _mm_blendv_epi8(UR0, V1, _mm_cmpeq_epi16(FS, c_3)); + + T0 = 
_mm_add_epi16(_mm_add_epi16(TL2, TR0), _mm_slli_epi16(TL2, 1)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1, 3)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0, 2)); + V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); + + T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0), _mm_slli_epi16(TR2, 1)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1, 3)); + T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0, 2)); + V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); + + UL1 = _mm_blendv_epi8(UL1, V2, _mm_cmpeq_epi16(FS, c_3)); + UR1 = _mm_blendv_epi8(UR1, V3, _mm_cmpeq_epi16(FS, c_3)); + + /* store result */ + + ((int64_t*)(SrcPtrU - inc))[0] = _mm_extract_epi64(UL0, 0); + ((int64_t*)(SrcPtrU))[0] = _mm_extract_epi64(UR0, 0); + ((int64_t*)(SrcPtrU - inc2))[0] = _mm_extract_epi64(UL1, 0); + ((int64_t*)(SrcPtrU + inc))[0] = _mm_extract_epi64(UR1, 0); + ((int64_t*)(SrcPtrV - inc))[0] = _mm_extract_epi64(UL0, 1); + ((int64_t*)(SrcPtrV))[0] = _mm_extract_epi64(UR0, 1); + ((int64_t*)(SrcPtrV - inc2))[0] = _mm_extract_epi64(UL1, 1); + ((int64_t*)(SrcPtrV + inc))[0] = _mm_extract_epi64(UR1, 1); +} +#endif // #if !HIGH_BIT_DEPTH diff --git a/source/common/vec/intrinsic_idct.c b/source/common/vec/intrinsic_idct.c index 31f4c97..8c24e51 100644 --- a/source/common/vec/intrinsic_idct.c +++ b/source/common/vec/intrinsic_idct.c @@ -36,6 +36,7 @@ #include "../basic_types.h" #include "../avs2_defs.h" +#include "../common.h" #include "intrinsic.h" #include @@ -65,12 +66,13 @@ extern ALIGN16(const int16_t g_2T_C[SEC_TR_SIZE * SEC_TR_SIZE]); /* --------------------------------------------------------------------------- */ -void idct_c_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_4x4_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_dst) { const int shift1 = 5; - const int shift2 = 20 - g_bit_depth; + const int shift2 = 20 - h->param->input_sample_bit_depth; // const int clip_depth1 = LIMIT_BIT; - const int clip_depth2 = g_bit_depth + 1; + const int clip_depth2 = 
h->param->input_sample_bit_depth + 1; const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); @@ -145,12 +147,13 @@ void idct_c_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -void idct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_4x16_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_dst) { const int shift1 = 5; - const int shift2 = 20 - g_bit_depth; + const int shift2 = 20 - h->param->input_sample_bit_depth; // const int clip_depth1 = LIMIT_BIT; - const int clip_depth2 = g_bit_depth + 1; + const int clip_depth2 = h->param->input_sample_bit_depth + 1; const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); @@ -449,12 +452,13 @@ void idct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -void idct_c_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_16x4_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_dst) { const int shift1 = 5; - const int shift2 = 20 - g_bit_depth; + const int shift2 = 20 - h->param->input_sample_bit_depth; // const int clip_depth1 = LIMIT_BIT; - const int clip_depth2 = g_bit_depth + 1; + const int clip_depth2 = h->param->input_sample_bit_depth + 1; const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); @@ -717,12 +721,13 @@ void idct_c_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -void idct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_8x8_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_dst) { // const int shift1 = 5; - const int shift2 = 20 
- g_bit_depth; + const int shift2 = 20 - h->param->input_sample_bit_depth; // const int clip_depth1 = LIMIT_BIT; - const int clip_depth2 = g_bit_depth + 1; + const int clip_depth2 = h->param->input_sample_bit_depth + 1; __m128i S0, S1, S2, S3, S4, S5, S6, S7; __m128i mAdd, T0, T1, T2, T3; @@ -815,7 +820,7 @@ void idct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst) E2l = _mm_add_epi32(E2l, mAdd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, mAdd); - S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5)); // 首次反变换移位数 + S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5)); // 棣栨鍙嶅彉鎹㈢Щ浣嶆暟 S7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 5)); S1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 5)); S6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 5)); @@ -987,12 +992,13 @@ void idct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -void idct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_16x16_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_dst) { const int shift1 = 5; - const int shift2 = 20 - g_bit_depth; + const int shift2 = 20 - h->param->input_sample_bit_depth; //const int clip_depth1 = LIMIT_BIT; - const int clip_depth2 = g_bit_depth + 1; + const int clip_depth2 = h->param->input_sample_bit_depth + 1; const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); @@ -1311,7 +1317,7 @@ void idct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, 
tr1_7); \ - + TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) @@ -1415,13 +1421,14 @@ void idct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -void idct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_32x32_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_dst) { int a_flag = i_dst & 0x01; //int shift1 = 5; - int shift2 = 20 - g_bit_depth - a_flag; + int shift2 = 20 - h->param->input_sample_bit_depth - a_flag; //int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1 + a_flag; + int clip_depth2 = h->param->input_sample_bit_depth + 1 + a_flag; const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D); const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C); @@ -2206,7 +2213,8 @@ void idct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -void idct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_32x8_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_dst) { __m128i m128iS0[4], m128iS1[4], m128iS2[4], m128iS3[4], m128iS4[4], m128iS5[4], m128iS6[4], m128iS7[4]; __m128i m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3; @@ -2214,9 +2222,9 @@ void idct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst) __m128i O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l; __m128i EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; //int shift1 = 5; - int shift2 = 20 - 
g_bit_depth - (i_dst & 0x01); + int shift2 = 20 - h->param->input_sample_bit_depth - (i_dst & 0x01); //int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); + int clip_depth2 = h->param->input_sample_bit_depth + 1 + (i_dst & 0x01); int i, pass; i_dst &= 0xFE; /* remember to remove the flag bit */ @@ -2305,7 +2313,7 @@ void idct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst) E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, m128iAdd); - m128iS0[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5)); // 首次反变换移位数 + m128iS0[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5)); // 棣栨鍙嶅彉鎹㈢Щ浣嶆暟 m128iS7[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 5)); m128iS1[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 5)); m128iS6[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 5)); @@ -3044,7 +3052,8 @@ void idct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst) /* --------------------------------------------------------------------------- */ -void idct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_8x32_sse128(xavs2_t *h, + const coeff_t *src, coeff_t *dst, int i_dst) { const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D); const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C); @@ -3228,9 +3237,9 @@ void idct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst) int nShift = 5, pass; //int shift1 = 5; - int shift2 = 20 - g_bit_depth - (i_dst & 0x01); + int shift2 = 20 - h->param->input_sample_bit_depth - (i_dst & 0x01); //int clip_depth1 = LIMIT_BIT; - int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); + int clip_depth2 = h->param->input_sample_bit_depth + 1 + (i_dst & 0x01); // DCT1 __m128i in00, 
in01, in02, in03, in04, in05, in06, in07, in08, in09, in10, in11, in12, in13, in14, in15; @@ -3805,7 +3814,7 @@ void idct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst) E2l = _mm_add_epi32(E2l, c32_rnd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, c32_rnd); - in00 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), nShift), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), nShift)); // 首次反变换移位数 + in00 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), nShift), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), nShift)); // 棣栨鍙嶅彉鎹㈢Щ浣嶆暟 in07 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), nShift), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), nShift)); in01 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), nShift), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), nShift)); in06 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), nShift), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), nShift)); @@ -3966,11 +3975,12 @@ void inv_transform_2nd_sse128(coeff_t *coeff, int i_coeff, int i_mode, int b_top /* --------------------------------------------------------------------------- */ -void inv_transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff) +void inv_transform_4x4_2nd_sse128(xavs2_t *h, + coeff_t *coeff, int i_coeff) { const int shift1 = 5; - const int shift2 = 20 - g_bit_depth + 2; - const int clip_depth2 = g_bit_depth + 1; + const int shift2 = 20 - h->param->input_sample_bit_depth + 2; + const int clip_depth2 = h->param->input_sample_bit_depth + 1; /*---vertical transform first---*/ __m128i factor = _mm_set1_epi32(1 << (shift1 - 1)); // add1 @@ -4062,23 +4072,23 @@ void inv_transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff) O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - + #define TRANSPOSE_16x16_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, A8_1, A9_1, 
A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1) \ TRANSPOSE_8x8_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0); \ TRANSPOSE_8x8_16BIT(A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1); \ TRANSPOSE_8x8_16BIT(A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0); \ TRANSPOSE_8x8_16BIT(A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1); \ - + /* --------------------------------------------------------------------------- */ static void inv_wavelet_64x64_sse128(coeff_t *coeff) { int i; - //按行 64*64 + //鎸夎 64*64 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8], T16[8], T17[8], T18[8], T19[8], T20[8], T21[8], T22[8], T23[8], T24[8], T25[8], T26[8], T27[8], T28[8], T29[8], T30[8], T31[8], T32[8], T33[8], T34[8], T35[8], T36[8], T37[8], T38[8], T39[8], T40[8], T41[8], T42[8], T43[8], T44[8], T45[8], T46[8], T47[8], T48[8], T49[8], T50[8], T51[8], T52[8], T53[8], T54[8], T55[8], T56[8], T57[8], T58[8], T59[8], T60[8], T61[8], T62[8], T63[8]; - //按列 16*64 + //鎸夊垪 16*64 __m128i V00[8], V01[8], V02[8], V03[8], V04[8], V05[8], V06[8], V07[8], V08[8], V09[8], V10[8], V11[8], V12[8], V13[8], V14[8], V15[8], V16[8], V17[8], V18[8], V19[8], V20[8], V21[8], V22[8], V23[8], V24[8], V25[8], V26[8], V27[8], V28[8], V29[8], V30[8], V31[8], V32[8], V33[8], V34[8], V35[8], V36[8], V37[8], V38[8], V39[8], V40[8], V41[8], V42[8], V43[8], V44[8], V45[8], V46[8], V47[8], V48[8], V49[8], V50[8], V51[8], V52[8], V53[8], V54[8], V55[8], V56[8], V57[8], V58[8], V59[8], V60[8], V61[8], V62[8], V63[8]; __m128i 
tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; @@ -4357,10 +4367,10 @@ static void inv_wavelet_64x64_sse128(coeff_t *coeff) static void inv_wavelet_64x16_sse128(coeff_t *coeff) { int i; - //按行 64*16 + //鎸夎 64*16 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8]; - //按列 16*64 + //鎸夊垪 16*64 __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2]; __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; @@ -4573,10 +4583,10 @@ static void inv_wavelet_16x64_sse128(coeff_t *coeff) __m128i S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31; __m128i S32, S33, S34, S35, S36, S37, S38, S39, S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63; - //按行 64*16 + //鎸夎 64*16 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8]; - //按列 16*64 + //鎸夊垪 16*64 __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], 
V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2]; __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; @@ -4767,27 +4777,27 @@ static void inv_wavelet_16x64_sse128(coeff_t *coeff) /* --------------------------------------------------------------------------- */ -void idct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_64x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); - idct_c_32x32_sse128(src, dst, 32 | 0x01); /* 32x32 idct */ + idct_c_32x32_sse128(h, src, dst, 32 | 0x01); /* 32x32 idct */ inv_wavelet_64x64_sse128(dst); } /* --------------------------------------------------------------------------- */ -void idct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_64x16_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); - idct_c_32x8_sse128(src, dst, 32 | 0x01); + idct_c_32x8_sse128(h, src, dst, 32 | 0x01); inv_wavelet_64x16_sse128(dst); } /* --------------------------------------------------------------------------- */ -void idct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst) +void idct_c_16x64_sse128(xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); - idct_c_8x32_sse128(src, dst, 8 | 0x01); + idct_c_8x32_sse128(h, src, dst, 8 | 0x01); inv_wavelet_16x64_sse128(dst); } diff --git a/source/common/vec/intrinsic_inter_pred.c b/source/common/vec/intrinsic_inter_pred.c index 7dddf86..55957d8 100644 --- a/source/common/vec/intrinsic_inter_pred.c +++ b/source/common/vec/intrinsic_inter_pred.c @@ -44,9 +44,11 @@ #include "intrinsic.h" #include "avs2_defs.h" +#if !HIGH_BIT_DEPTH /* --------------------------------------------------------------------------- */ -void intpl_chroma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) +void intpl_chroma_block_hor_sse128(xavs2_t *h, + pel_t 
*dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { const int16_t offset = 32; const int shift = 6; @@ -95,7 +97,8 @@ void intpl_chroma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, /* --------------------------------------------------------------------------- */ -void intpl_luma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) +void intpl_luma_block_hor_sse128(xavs2_t *h, + pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row, col = 0; const short offset = 32; @@ -154,7 +157,8 @@ void intpl_luma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, i /* --------------------------------------------------------------------------- */ -void intpl_luma_hor_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff) +void intpl_luma_hor_sse128(xavs2_t *h, + pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int row, col = 0; const short offset = 32; @@ -270,9 +274,10 @@ void intpl_luma_hor_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t * } /* --------------------------------------------------------------------------- - * TODO: @luofl 20170827 按照 intpl_luma_hor_sse128() 改写,依次插值16列 + * TODO: @luofl 20170827 鎸夌収 intpl_luma_hor_sse128() 鏀瑰啓锛屼緷娆℃彃鍊16鍒 */ -void intpl_luma_hor_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff) +void intpl_luma_hor_x3_sse128(xavs2_t *h, + pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff) { int row, col = 0; const short offset = 32; @@ -557,7 +562,8 @@ void intpl_luma_hor_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *const tmp[3 result = _mm_packus_epi16(mVal1, mVal1); -void 
intpl_luma_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) +void intpl_luma_ver_sse128(xavs2_t *h, + pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int row, col; const short offset = 32; @@ -691,7 +697,8 @@ void intpl_luma_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int wid /* --------------------------------------------------------------------------- * */ -void intpl_luma_ver_x3_sse128(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff) +void intpl_luma_ver_x3_sse128(xavs2_t *h, + pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff) { /* intpl_luma_ver_sse128(dst0, i_dst, src, i_src, width, height, coeff[0]); @@ -956,7 +963,8 @@ void intpl_luma_ver_x3_sse128(pel_t *const dst[3], int i_dst, pel_t *src, int i_ /* --------------------------------------------------------------------------- */ -void intpl_luma_ext_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff) +void intpl_luma_ext_sse128(xavs2_t *h, + pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff) { int row, col; int shift; @@ -1188,7 +1196,8 @@ void intpl_luma_ext_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int wid } } -void intpl_luma_ext_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff) +void intpl_luma_ext_x3_sse128(xavs2_t *h, + pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff) { /* intpl_luma_ext_sse128(dst0, i_dst, tmp, i_tmp, width, height, coeff[0]); @@ -1581,7 +1590,8 @@ void intpl_luma_ext_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_ /* --------------------------------------------------------------------------- */ -void intpl_chroma_block_ver_sse128(pel_t *dst, int i_dst, pel_t 
*src, int i_src, int width, int height, const int8_t *coeff) +void intpl_chroma_block_ver_sse128(xavs2_t *h, + pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row, col; const short offset = 32; @@ -1704,7 +1714,8 @@ void intpl_chroma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, /* --------------------------------------------------------------------------- */ -void intpl_luma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) +void intpl_luma_block_ver_sse128(xavs2_t *h, + pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { const short offset = 32; const int shift = 6; @@ -1839,7 +1850,8 @@ void intpl_luma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, i /* --------------------------------------------------------------------------- */ -void intpl_chroma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) +void intpl_chroma_block_ext_sse128(xavs2_t *h, + pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN16(int16_t tmp_res[(32 + 3) * 32]); int16_t *tmp = tmp_res; @@ -2074,7 +2086,8 @@ void intpl_chroma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, /* --------------------------------------------------------------------------- */ -void intpl_luma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) +void intpl_luma_block_ext_sse128(xavs2_t *h, + pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN16(int16_t tmp_res[(64 + 7) * 64]); int16_t *tmp = tmp_res; @@ -2312,4 +2325,4 @@ void intpl_luma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, i } } } - +#endif diff --git 
a/source/common/vec/intrinsic_intra-filledge.c b/source/common/vec/intrinsic_intra-filledge.c index 6776a5d..1bd8e3e 100644 --- a/source/common/vec/intrinsic_intra-filledge.c +++ b/source/common/vec/intrinsic_intra-filledge.c @@ -36,6 +36,7 @@ #include "../avs2_defs.h" #include "../basic_types.h" +#include "../common.h" #include "intrinsic.h" #include @@ -45,11 +46,13 @@ #include +#if !HIGH_BIT_DEPTH /* --------------------------------------------------------------------------- * fill reference samples for intra prediction - * LCU内在左边界上的PU + * LCU鍐呭湪宸﹁竟鐣屼笂鐨凱U */ -void fill_edge_samples_0_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) +void fill_edge_samples_0_sse128(xavs2_t *h, + const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { __m128i T0, T1; int i, k, j; @@ -60,12 +63,12 @@ void fill_edge_samples_0_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* fill default value */ k = ((bsy + bsx) << 1) + 1; j = (k >> 4) << 4; - T0 = _mm_set1_epi8((uint8_t)g_dc_value); + T0 = _mm_set1_epi8((uint8_t)((1 << h->param->input_sample_bit_depth) >> 1)); for (i = 0; i < j; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } - memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1); - EP[2 * bsx] = (pel_t)g_dc_value; + memset(&EP[-(bsy << 1)] + j, ((1 << h->param->input_sample_bit_depth) >> 1), k - j + 1); + EP[2 * bsx] = (pel_t)((1 << h->param->input_sample_bit_depth) >> 1); /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels @@ -162,9 +165,10 @@ void fill_edge_samples_0_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* --------------------------------------------------------------------------- * fill reference samples for intra prediction - * LCU内在左边界上的PU + * LCU鍐呭湪宸﹁竟鐣屼笂鐨凱U */ -void fill_edge_samples_x_sse128(const pel_t *pTL, int 
i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) +void fill_edge_samples_x_sse128(xavs2_t *h, + const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { __m128i T0, T1; int i, k, j; @@ -175,12 +179,12 @@ void fill_edge_samples_x_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* fill default value */ k = ((bsy + bsx) << 1) + 1; j = (k >> 4) << 4; - T0 = _mm_set1_epi8((uint8_t)g_dc_value); + T0 = _mm_set1_epi8((uint8_t)((1 << h->param->input_sample_bit_depth) >> 1)); for (i = 0; i < j; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } - memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1); - EP[2 * bsx] = (pel_t)g_dc_value; + memset(&EP[-(bsy << 1)] + j, ((1 << h->param->input_sample_bit_depth) >> 1), k - j + 1); + EP[2 * bsx] = (pel_t)((1 << h->param->input_sample_bit_depth) >> 1); /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels @@ -287,9 +291,10 @@ void fill_edge_samples_x_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* --------------------------------------------------------------------------- * fill reference samples for intra prediction - * LCU内在左边界上的PU + * LCU鍐呭湪宸﹁竟鐣屼笂鐨凱U */ -void fill_edge_samples_y_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) +void fill_edge_samples_y_sse128(xavs2_t *h, + const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { __m128i T0, T1; int i, k, j; @@ -301,12 +306,12 @@ void fill_edge_samples_y_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* fill default value */ k = ((bsy + bsx) << 1) + 1; j = (k >> 4) << 4; - T0 = _mm_set1_epi8((uint8_t)g_dc_value); + T0 = _mm_set1_epi8((uint8_t)((1 << h->param->input_sample_bit_depth) >> 1)); for (i = 0; i < j; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + 
i), T0); } - memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1); - EP[2 * bsx] = (pel_t)g_dc_value; + memset(&EP[-(bsy << 1)] + j, ((1 << h->param->input_sample_bit_depth) >> 1), k - j + 1); + EP[2 * bsx] = (pel_t)((1 << h->param->input_sample_bit_depth) >> 1); /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels @@ -403,9 +408,10 @@ void fill_edge_samples_y_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, /* --------------------------------------------------------------------------- * fill reference samples for intra prediction - * LCU内在左边界上的PU + * LCU鍐呭湪宸﹁竟鐣屼笂鐨凱U */ -void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) +void fill_edge_samples_xy_sse128(xavs2_t *h, + const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { __m128i T0, T1; int i, k, j; @@ -418,12 +424,12 @@ void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP /* fill default value */ k = ((bsy + bsx) << 1) + 1; j = (k >> 4) << 4; - T0 = _mm_set1_epi8((uint8_t)g_dc_value); + T0 = _mm_set1_epi8((uint8_t)((1 << h->param->input_sample_bit_depth) >> 1)); for (i = 0; i < j; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } - memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1); - EP[2 * bsx] = (pel_t)g_dc_value; + memset(&EP[-(bsy << 1)] + j, ((1 << h->param->input_sample_bit_depth) >> 1), k - j + 1); + EP[2 * bsx] = (pel_t)((1 << h->param->input_sample_bit_depth) >> 1); /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels @@ -527,5 +533,5 @@ void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP EP[0] = pL[0]; } } - +#endif diff --git a/source/common/vec/intrinsic_intra-pred.c 
b/source/common/vec/intrinsic_intra-pred.c index d807f6d..ac13b20 100644 --- a/source/common/vec/intrinsic_intra-pred.c +++ b/source/common/vec/intrinsic_intra-pred.c @@ -36,6 +36,7 @@ #include "../avs2_defs.h" #include "../basic_types.h" +#include "../common.h" #include "intrinsic.h" #include #include @@ -44,6 +45,7 @@ #include +#if !HIGH_BIT_DEPTH static ALIGN16(int8_t tab_coeff_mode_5[8][16]) = { { 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12 }, { 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24 }, @@ -63,7 +65,8 @@ static uint8_t tab_idx_mode_5[64] = { /* --------------------------------------------------------------------------- */ -void intra_pred_ver_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ver_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int y; pel_t *rpSrc = src + 1; @@ -124,7 +127,8 @@ void intra_pred_ver_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int /* --------------------------------------------------------------------------- */ -void intra_pred_hor_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_hor_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int y; pel_t *rpSrc = src - 1; @@ -179,7 +183,8 @@ void intra_pred_hor_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int /* --------------------------------------------------------------------------- */ -void intra_pred_dc_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_dc_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int avail_above = dir_mode >> 8; int avail_left = dir_mode & 0xFF; @@ -251,7 +256,7 @@ void intra_pred_dc_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int b } else if (avail_above) { dc_value = (sum_above + (bsx >> 1)) >> xavs2_log2u(bsx); } else { - dc_value = g_dc_value; 
+ dc_value = ((1 << h->param->input_sample_bit_depth) >> 1); } p00 = _mm_set1_epi8((pel_t)dc_value); @@ -272,7 +277,8 @@ void intra_pred_dc_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int b /* --------------------------------------------------------------------------- */ -void intra_pred_plane_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_plane_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *rpSrc; int iH = 0; @@ -356,7 +362,8 @@ void intra_pred_plane_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, in /* --------------------------------------------------------------------------- */ -void intra_pred_bilinear_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_bilinear_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int x, y; int ishift_x = tab_log2[bsx]; @@ -543,7 +550,8 @@ void intra_pred_bilinear_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_x_3_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_x_3_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); @@ -1111,7 +1119,8 @@ void intra_pred_ang_x_3_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_x_4_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_x_4_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); @@ -1244,7 +1253,8 @@ void intra_pred_ang_x_4_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* 
--------------------------------------------------------------------------- */ -void intra_pred_ang_x_5_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_x_5_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); @@ -1977,7 +1987,8 @@ void intra_pred_ang_x_5_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_x_6_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_x_6_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; @@ -2194,7 +2205,8 @@ void intra_pred_ang_x_6_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_x_7_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_x_7_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; int iWidth2 = bsx << 1; @@ -2521,7 +2533,8 @@ void intra_pred_ang_x_7_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_x_8_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_x_8_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 48)]); int line_size = bsx + (bsy >> 1) - 1; @@ -2702,7 +2715,8 @@ void intra_pred_ang_x_8_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_x_9_sse128(pel_t *src, pel_t *dst, int i_dst, int 
dir_mode, int bsx, int bsy) +void intra_pred_ang_x_9_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; int iWidth2 = bsx << 1; @@ -3030,7 +3044,8 @@ void intra_pred_ang_x_9_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_x_10_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_x_10_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; pel_t *dst1 = dst; @@ -3526,7 +3541,8 @@ void intra_pred_ang_x_10_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_x_11_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_x_11_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j, idx; __m128i zero = _mm_setzero_si128(); @@ -3663,7 +3679,8 @@ void intra_pred_ang_x_11_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_y_25_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_y_25_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; @@ -4152,7 +4169,8 @@ void intra_pred_ang_y_25_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_y_26_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_y_26_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; @@ -4576,7 +4594,8 @@ void intra_pred_ang_y_26_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* 
--------------------------------------------------------------------------- */ -void intra_pred_ang_y_28_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_y_28_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int line_size = bsx + (bsy - 1) * 2; @@ -4717,7 +4736,8 @@ void intra_pred_ang_y_28_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_y_30_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_y_30_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; @@ -4956,7 +4976,8 @@ void intra_pred_ang_y_30_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_y_31_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_y_31_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t dst_tran[64 * 80]); ALIGN16(pel_t src_tran[64 * 8]); @@ -4970,7 +4991,7 @@ void intra_pred_ang_y_31_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, src_tran[i] = src[-i]; } - intra_pred_ang_x_5_sse128(src_tran, dst_tran, i_dst2, 5, bsy, bsx); + intra_pred_ang_x_5_sse128(h, src_tran, dst_tran, i_dst2, 5, bsy, bsx); if ((bsy > 4) && (bsx > 4)) { pel_t *pDst_128[64]; @@ -5212,7 +5233,8 @@ void intra_pred_ang_y_31_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_y_32_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_y_32_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int 
dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 64)]); int line_size = (bsy >> 1) + bsx - 1; @@ -5367,7 +5389,8 @@ void intra_pred_ang_y_32_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, /* --------------------------------------------------------------------------- */ -void intra_pred_ang_xy_13_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_xy_13_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); @@ -6383,7 +6406,8 @@ void intra_pred_ang_xy_13_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode /* --------------------------------------------------------------------------- */ -void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_xy_14_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; __m128i coeff2 = _mm_set1_epi16(2); @@ -6453,7 +6477,7 @@ void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode ((int*)&pfirst[3][i])[0] = _mm_cvtsi128_si32(p00); } - if (i < left_size) { //使用c语言可能会更优 + if (i < left_size) { //使用c语言可能会更优 __m128i p00, p01, p10; __m128i p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); @@ -6755,7 +6779,8 @@ void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode /* --------------------------------------------------------------------------- */ -void intra_pred_ang_xy_16_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_xy_16_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 48)]); int line_size = bsx + bsy / 2 - 1; @@ -6925,7 +6950,8 @@ void intra_pred_ang_xy_16_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode /* 
--------------------------------------------------------------------------- */ -void intra_pred_ang_xy_18_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_xy_18_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; @@ -7016,7 +7042,8 @@ void intra_pred_ang_xy_18_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode /* --------------------------------------------------------------------------- */ -void intra_pred_ang_xy_20_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_xy_20_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int left_size = (bsy - 1) * 2 + 1; @@ -7188,7 +7215,8 @@ void intra_pred_ang_xy_20_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode /* --------------------------------------------------------------------------- */ -void intra_pred_ang_xy_22_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_xy_22_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; @@ -7464,7 +7492,8 @@ void intra_pred_ang_xy_22_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode /* --------------------------------------------------------------------------- */ -void intra_pred_ang_xy_23_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) +void intra_pred_ang_xy_23_sse128(xavs2_t *h, + pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; @@ -7924,7 +7953,527 @@ void intra_pred_ang_xy_23_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode *((int*)dst) = _mm_cvtsi128_si32(M7); } } +} +#else + /***************************************************************************** + * Copyright (C) 2016 uavs2dec project, + * National Engineering Laboratory for Video 
Technology(Shenzhen), + * Digital Media R&D Center at Peking University Shenzhen Graduate School, China + * Project Leader: Ronggang Wang + * + * Main Authors: Zhenyu Wang , Kui Fan + * Shenghao Zhang <1219759986@qq.com>, Bingjie Han, Kaili Yao, Hongbin Cao, Yueming Wang, + * Jing Su, Jiaying Yan, Junru Li + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at rgwang@pkusz.edu.cn. 
+ *****************************************************************************/ + +void intra_pred_ver_sse128(xavs2_t *h, + pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight) +{ + int y; + pel_t* rpSrc = pSrc + 1; + __m128i T1, T2, T3, T4; + __m128i M1, M2, M3, M4; + UNUSED_PARAMETER(dir_mode); + + switch (iWidth) { + case 4: + for (y = 0; y < iHeight; y += 2) { + CP64(dst, rpSrc); + CP64(dst + i_dst, rpSrc); + dst += i_dst << 1; + } + break; + case 8: + T1 = _mm_loadu_si128((__m128i*)rpSrc); + for (y = 0; y < iHeight; y++) { + _mm_storeu_si128((__m128i*)(dst), T1); + dst += i_dst; + } + break; + case 16: + T1 = _mm_loadu_si128((__m128i*)(rpSrc + 0)); + T2 = _mm_loadu_si128((__m128i*)(rpSrc + 8)); + for (y = 0; y < iHeight; y++) { + _mm_storeu_si128((__m128i*)(dst + 0), T1); + _mm_storeu_si128((__m128i*)(dst + 8), T2); + dst += i_dst; + } + break; + case 32: + T1 = _mm_loadu_si128((__m128i*)(rpSrc + 0)); + T2 = _mm_loadu_si128((__m128i*)(rpSrc + 8)); + T3 = _mm_loadu_si128((__m128i*)(rpSrc + 16)); + T4 = _mm_loadu_si128((__m128i*)(rpSrc + 24)); + for (y = 0; y < iHeight; y++) { + _mm_storeu_si128((__m128i*)(dst + 0), T1); + _mm_storeu_si128((__m128i*)(dst + 8), T2); + _mm_storeu_si128((__m128i*)(dst + 16), T3); + _mm_storeu_si128((__m128i*)(dst + 24), T4); + dst += i_dst; + } + break; + case 64: + T1 = _mm_loadu_si128((__m128i*)(rpSrc + 0)); + T2 = _mm_loadu_si128((__m128i*)(rpSrc + 8)); + T3 = _mm_loadu_si128((__m128i*)(rpSrc + 16)); + T4 = _mm_loadu_si128((__m128i*)(rpSrc + 24)); + M1 = _mm_loadu_si128((__m128i*)(rpSrc + 32)); + M2 = _mm_loadu_si128((__m128i*)(rpSrc + 40)); + M3 = _mm_loadu_si128((__m128i*)(rpSrc + 48)); + M4 = _mm_loadu_si128((__m128i*)(rpSrc + 56)); + for (y = 0; y < iHeight; y++) { + _mm_storeu_si128((__m128i*)(dst + 0), T1); + _mm_storeu_si128((__m128i*)(dst + 8), T2); + _mm_storeu_si128((__m128i*)(dst + 16), T3); + _mm_storeu_si128((__m128i*)(dst + 24), T4); + _mm_storeu_si128((__m128i*)(dst + 32), M1); + 
_mm_storeu_si128((__m128i*)(dst + 40), M2); + _mm_storeu_si128((__m128i*)(dst + 48), M3); + _mm_storeu_si128((__m128i*)(dst + 56), M4); + dst += i_dst; + } + break; + default: + assert(0); + break; + } +} + +void intra_pred_hor_sse128(xavs2_t *h, + pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight) +{ + int y; + pel_t* rpSrc = pSrc - 1; + __m128i T; + + UNUSED_PARAMETER(dir_mode); + + switch (iWidth) { + case 4: + for (y = 0; y < iHeight; y++) { + M64(dst) = 0x0001000100010001 * rpSrc[-y]; + dst += i_dst; + } + break; + case 8: + for (y = 0; y < iHeight; y++) { + T = _mm_set1_epi16((pel_t)rpSrc[-y]); + _mm_storeu_si128((__m128i*)(dst), T); + dst += i_dst; + } + break; + case 16: + for (y = 0; y < iHeight; y++) { + T = _mm_set1_epi16((pel_t)rpSrc[-y]); + _mm_storeu_si128((__m128i*)(dst + 0), T); + _mm_storeu_si128((__m128i*)(dst + 8), T); + dst += i_dst; + } + break; + case 32: + for (y = 0; y < iHeight; y++) { + T = _mm_set1_epi16((pel_t)rpSrc[-y]); + _mm_storeu_si128((__m128i*)(dst + 0), T); + _mm_storeu_si128((__m128i*)(dst + 8), T); + _mm_storeu_si128((__m128i*)(dst + 16), T); + _mm_storeu_si128((__m128i*)(dst + 24), T); + dst += i_dst; + } + break; + case 64: + for (y = 0; y < iHeight; y++) { + T = _mm_set1_epi16((pel_t)rpSrc[-y]); + _mm_storeu_si128((__m128i*)(dst + 0), T); + _mm_storeu_si128((__m128i*)(dst + 8), T); + _mm_storeu_si128((__m128i*)(dst + 16), T); + _mm_storeu_si128((__m128i*)(dst + 24), T); + _mm_storeu_si128((__m128i*)(dst + 32), T); + _mm_storeu_si128((__m128i*)(dst + 40), T); + _mm_storeu_si128((__m128i*)(dst + 48), T); + _mm_storeu_si128((__m128i*)(dst + 56), T); + dst += i_dst; + } + break; + default: + assert(0); + break; + } +} + +void intra_pred_plane_sse128(xavs2_t *h, + pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight) +{ + pel_t* rpSrc; + int iH = 0; + int iV = 0; + int iA, iB, iC; + int x, y; + int iW2 = iWidth >> 1; + int iH2 = iHeight >> 1; + int ib_mult[5] = { 13, 17, 5, 11, 23 }; + 
int ib_shift[5] = { 7, 10, 11, 15, 19 }; + int max_pixel = (1 << h->param->input_sample_bit_depth) - 1; + __m128i max_val = _mm_set1_epi16((pel_t)max_pixel); + + int im_h = ib_mult[tab_log2[iWidth] - 2]; + int is_h = ib_shift[tab_log2[iWidth] - 2]; + int im_v = ib_mult[tab_log2[iHeight] - 2]; + int is_v = ib_shift[tab_log2[iHeight] - 2]; + + int iTmp; + __m128i TC, TB, TA, T_Start, T, D, D1; + + UNUSED_PARAMETER(dir_mode); + + rpSrc = pSrc + iW2; + for (x = 1; x < iW2 + 1; x++) { + iH += x * (rpSrc[x] - rpSrc[-x]); + } + + rpSrc = pSrc - iH2; + for (y = 1; y < iH2 + 1; y++) { + iV += y * (rpSrc[-y] - rpSrc[y]); + } + + iA = (pSrc[-1 - (iHeight - 1)] + pSrc[1 + iWidth - 1]) << 4; + iB = ((iH << 5) * im_h + (1 << (is_h - 1))) >> is_h; + iC = ((iV << 5) * im_v + (1 << (is_v - 1))) >> is_v; + + iTmp = iA - (iH2 - 1) * iC - (iW2 - 1) * iB + 16; + + TA = _mm_set1_epi32((int16_t)iTmp); + TB = _mm_set1_epi32((int16_t)iB); + TC = _mm_set1_epi32((int16_t)iC); + + T_Start = _mm_set_epi32(3, 2, 1, 0); + T_Start = _mm_mullo_epi32(TB, T_Start); + T_Start = _mm_add_epi32(T_Start, TA); + + TB = _mm_slli_epi32(TB, 2); + + if (iWidth <= 4) { + for (y = 0; y < iHeight; y++) { + D = _mm_srai_epi32(T_Start, 5); + D = _mm_packus_epi32(D, D); + D = _mm_min_epu16(D, max_val); + _mm_storel_epi64((__m128i*)dst, D); + T_Start = _mm_add_epi32(T_Start, TC); + dst += i_dst; + } + } + else + { + for (y = 0; y < iHeight; y++) { + T = T_Start; + for (x = 0; x < iWidth; x += 8) { + D = _mm_srai_epi32(T, 5); + T = _mm_add_epi32(T, TB); + D1 = _mm_srai_epi32(T, 5); + T = _mm_add_epi32(T, TB); + D = _mm_packus_epi32(D, D1); + D = _mm_min_epu16(D, max_val); + _mm_storeu_si128((__m128i*)(dst + x), D); + } + T_Start = _mm_add_epi32(T_Start, TC); + dst += i_dst; + } + } +} + +void intra_pred_bilinear_sse128(xavs2_t *h, + pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight) +{ + int x, y; + int ishift_x = tab_log2[iWidth]; + int ishift_y = tab_log2[iHeight]; + int ishift = 
min(ishift_x, ishift_y); + int ishift_xy = ishift_x + ishift_y + 1; + int offset = 1 << (ishift_x + ishift_y); + int a, b, c, w, val; + pel_t* p; + __m128i T, T1, T2, T3, C1, C2, ADD; + __m128i ZERO = _mm_setzero_si128(); + __m128i shuff = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + int max_pixel = (1 << h->param->input_sample_bit_depth) - 1; + __m128i max_val = _mm_set1_epi16((pel_t)max_pixel); + + ALIGN16(int16_t pTop[MAX_CU_SIZE + 16]); + ALIGN16(int16_t pLeft[MAX_CU_SIZE + 16]); + ALIGN16(int16_t pT[MAX_CU_SIZE + 16]); + ALIGN16(int16_t pL[MAX_CU_SIZE + 16]); + ALIGN16(int16_t wy[MAX_CU_SIZE + 16]); + + UNUSED_PARAMETER(dir_mode); + + a = pSrc[iWidth]; + b = pSrc[-iHeight]; + + c = (iWidth == iHeight) ? (a + b + 1) >> 1 : (((a << ishift_x) + (b << ishift_y)) * 13 + (1 << (ishift + 5))) >> (ishift + 6); + w = (c << 1) - a - b; + + T = _mm_set1_epi16((int16_t)b); + p = pSrc + 1; + + for (x = 0; x < iWidth; x += 8) { + T1 = _mm_loadu_si128((__m128i*)(p + x)); + T2 = _mm_sub_epi16(T, T1); + T1 = _mm_slli_epi16(T1, ishift_y); + _mm_store_si128((__m128i*)(pT + x), T2); + _mm_store_si128((__m128i*)(pTop + x), T1); + } + + T = _mm_set1_epi16((int16_t)a); + p = pSrc - 8; + + for (y = 0; y < iHeight; y += 8) { + T1 = _mm_loadu_si128((__m128i*)(p - y)); + T1 = _mm_shuffle_epi8(T1, shuff); + T2 = _mm_sub_epi16(T, T1); + T1 = _mm_slli_epi16(T1, ishift_x); + _mm_store_si128((__m128i*)(pL + y), T2); + _mm_store_si128((__m128i*)(pLeft + y), T1); + } + + T = _mm_set1_epi16((int16_t)w); + T = _mm_mullo_epi16(T, _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0)); + T1 = _mm_set1_epi16((int16_t)(8 * w)); + + for (y = 0; y < iHeight; y += 8) { + _mm_store_si128((__m128i*)(wy + y), T); + T = _mm_add_epi16(T, T1); + } + + C1 = _mm_set_epi32(3, 2, 1, 0); + C2 = _mm_set1_epi32(4); + + if (iWidth == 4) { + __m128i pTT = _mm_loadl_epi64((__m128i*)pT); + T = _mm_loadl_epi64((__m128i*)pTop); + for (y = 0; y < iHeight; y++) { + int add = (pL[y] << ishift_y) + wy[y]; + ADD = 
_mm_set1_epi32(add); + ADD = _mm_mullo_epi32(C1, ADD); + + val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); + + ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val)); + T = _mm_add_epi16(T, pTT); + + T1 = _mm_cvtepi16_epi32(T); + T1 = _mm_slli_epi32(T1, ishift_x); + + T1 = _mm_add_epi32(T1, ADD); + T1 = _mm_srai_epi32(T1, ishift_xy); + + T1 = _mm_packus_epi32(T1, T1); + T1 = _mm_min_epu16(T1, max_val); + _mm_storel_epi64((__m128i*)dst, T1); + + dst += i_dst; + } + } + else if (iWidth == 8) { + __m128i pTT = _mm_load_si128((__m128i*)pT); + T = _mm_load_si128((__m128i*)pTop); + for (y = 0; y < iHeight; y++) { + int add = (pL[y] << ishift_y) + wy[y]; + ADD = _mm_set1_epi32(add); + T3 = _mm_mullo_epi32(C2, ADD); + ADD = _mm_mullo_epi32(C1, ADD); + + val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); + + ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val)); + + T = _mm_add_epi16(T, pTT); + + T1 = _mm_cvtepi16_epi32(T); + T2 = _mm_cvtepi16_epi32(_mm_srli_si128(T, 8)); + T1 = _mm_slli_epi32(T1, ishift_x); + T2 = _mm_slli_epi32(T2, ishift_x); + + T1 = _mm_add_epi32(T1, ADD); + T1 = _mm_srai_epi32(T1, ishift_xy); + ADD = _mm_add_epi32(ADD, T3); + + T2 = _mm_add_epi32(T2, ADD); + T2 = _mm_srai_epi32(T2, ishift_xy); + ADD = _mm_add_epi32(ADD, T3); + + T1 = _mm_packus_epi32(T1, T2); + T1 = _mm_min_epu16(T1, max_val); + _mm_storeu_si128((__m128i*)dst, T1); + + dst += i_dst; + } + } + else { + __m128i TT[16]; + __m128i PTT[16]; + for (x = 0; x < iWidth; x += 8) { + int idx = x >> 2; + __m128i M0 = _mm_load_si128((__m128i*)(pTop + x)); + __m128i M1 = _mm_load_si128((__m128i*)(pT + x)); + TT[idx] = _mm_unpacklo_epi16(M0, ZERO); + TT[idx + 1] = _mm_unpackhi_epi16(M0, ZERO); + PTT[idx] = _mm_cvtepi16_epi32(M1); + PTT[idx + 1] = _mm_cvtepi16_epi32(_mm_srli_si128(M1, 8)); + } + for (y = 0; y < iHeight; y++) { + int add = (pL[y] << ishift_y) + wy[y]; + ADD = _mm_set1_epi32(add); + T3 = _mm_mullo_epi32(C2, ADD); + ADD = _mm_mullo_epi32(C1, ADD); + + val = ((uint16_t)pLeft[y] << 
ishift_y) + offset + (pL[y] << ishift_y); + + ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val)); + + for (x = 0; x < iWidth; x += 8) { + int idx = x >> 2; + TT[idx] = _mm_add_epi32(TT[idx], PTT[idx]); + TT[idx + 1] = _mm_add_epi32(TT[idx + 1], PTT[idx + 1]); + + T1 = _mm_slli_epi32(TT[idx], ishift_x); + T2 = _mm_slli_epi32(TT[idx + 1], ishift_x); + + T1 = _mm_add_epi32(T1, ADD); + T1 = _mm_srai_epi32(T1, ishift_xy); + ADD = _mm_add_epi32(ADD, T3); + + T2 = _mm_add_epi32(T2, ADD); + T2 = _mm_srai_epi32(T2, ishift_xy); + ADD = _mm_add_epi32(ADD, T3); + + T1 = _mm_packus_epi32(T1, T2); + T1 = _mm_min_epu16(T1, max_val); + _mm_storeu_si128((__m128i*)(dst + x), T1); + } + dst += i_dst; + } + } +} + +void intra_pred_dc_sse128(xavs2_t *h, + pel_t* pSrc, pel_t* dst, int i_dst, int dir_mode, int iWidth, int iHeight) +{ + int bAboveAvail = dir_mode >> 8; + int bLeftAvail = dir_mode & 0xFF; + + int x, y; + int iDCValue = 0; + pel_t* rpSrc = pSrc - 1; + int h_bitsize = tab_log2[iHeight]; + int w_bitsize = tab_log2[iWidth]; + int half_height = iHeight >> 1; + int half_width = iWidth >> 1; + __m128i T; + uint64_t v64; + + if (bLeftAvail) { + for (y = 0; y < iHeight; y++) { + iDCValue += rpSrc[-y]; + } + + rpSrc = pSrc + 1; + if (bAboveAvail) { + for (x = 0; x < iWidth; x++) { + iDCValue += rpSrc[x]; + } + + iDCValue += ((iWidth + iHeight) >> 1); + iDCValue = (iDCValue * (512 / (iWidth + iHeight))) >> 9; + } + else { + iDCValue += half_height; + iDCValue >>= h_bitsize; + } + } + else { + rpSrc = pSrc + 1; + if (bAboveAvail) { + for (x = 0; x < iWidth; x++) { + iDCValue += rpSrc[x]; + } + + iDCValue += half_width; + iDCValue >>= w_bitsize; + } + else { + iDCValue = 1 << (h->param->input_sample_bit_depth - 1); + } + } + + switch (iWidth) { + case 4: + v64 = 0x0001000100010001 * iDCValue; + for (y = 0; y < iHeight; y++) { + M64(dst) = v64; + dst += i_dst; + } + break; + case 8: + T = _mm_set1_epi16((pel_t)iDCValue); + for (y = 0; y < iHeight; y++) { + _mm_storeu_si128((__m128i*)(dst), 
T); + dst += i_dst; + } + break; + case 16: + T = _mm_set1_epi16((pel_t)iDCValue); + for (y = 0; y < iHeight; y++) { + _mm_storeu_si128((__m128i*)(dst + 0), T); + _mm_storeu_si128((__m128i*)(dst + 8), T); + dst += i_dst; + } + break; + case 32: + T = _mm_set1_epi16((pel_t)iDCValue); + for (y = 0; y < iHeight; y++) { + _mm_storeu_si128((__m128i*)(dst + 0), T); + _mm_storeu_si128((__m128i*)(dst + 8), T); + _mm_storeu_si128((__m128i*)(dst + 16), T); + _mm_storeu_si128((__m128i*)(dst + 24), T); + dst += i_dst; + } + break; + case 64: + T = _mm_set1_epi16((pel_t)iDCValue); + for (y = 0; y < iHeight; y++) { + _mm_storeu_si128((__m128i*)(dst + 0), T); + _mm_storeu_si128((__m128i*)(dst + 8), T); + _mm_storeu_si128((__m128i*)(dst + 16), T); + _mm_storeu_si128((__m128i*)(dst + 24), T); + _mm_storeu_si128((__m128i*)(dst + 32), T); + _mm_storeu_si128((__m128i*)(dst + 40), T); + _mm_storeu_si128((__m128i*)(dst + 48), T); + _mm_storeu_si128((__m128i*)(dst + 56), T); + dst += i_dst; + } + break; + default: + assert(0); + break; + } } +#endif // #if !HIGH_BIT_DEPTH diff --git a/source/common/vec/intrinsic_pixel.c b/source/common/vec/intrinsic_pixel.c index 9c5a9fa..bc97d5b 100644 --- a/source/common/vec/intrinsic_pixel.c +++ b/source/common/vec/intrinsic_pixel.c @@ -45,6 +45,39 @@ void xavs2_pixel_average_sse128(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height) { +#if HIGH_BIT_DEPTH + int j; + __m128i D; + + if (width & 7) { + //__m128i mask = _mm_load_si128((const __m128i *)intrinsic_mask_10bit[(width & 7) - 1]); + __m128i mask = _mm_load_si128((const __m128i*)intrinsic_mask[(width & 7) - 1]); + + while (height--) { + for (j = 0; j < width - 7; j += 8) { + D = _mm_avg_epu16(_mm_loadu_si128((const __m128i *)(src1 + j)), _mm_loadu_si128((const __m128i *)(src2 + j))); + _mm_storeu_si128((__m128i *)(dst + j), D); + } + + D = _mm_avg_epu16(_mm_loadu_si128((const __m128i *)(src1 + j)), _mm_loadu_si128((const __m128i *)(src2 + j))); + 
_mm_maskmoveu_si128(D, mask, (char *)&dst[j]); + + src1 += i_src1; + src2 += i_src2; + dst += i_dst; + } + } else { + while (height--) { + for (j = 0; j < width; j += 8) { + D = _mm_avg_epu16(_mm_loadu_si128((const __m128i *)(src1 + j)), _mm_loadu_si128((const __m128i *)(src2 + j))); + _mm_storeu_si128((__m128i *)(dst + j), D); + } + src1 += i_src1; + src2 += i_src2; + dst += i_dst; + } + } +#else int i, j; __m128i S1, S2, D; @@ -81,7 +114,7 @@ void xavs2_pixel_average_sse128(pel_t *dst, int i_dst, pel_t *src1, int i_src1, dst += i_dst; } } - +#endif } /* --------------------------------------------------------------------------- diff --git a/source/common/vec/intrinsic_sao.c b/source/common/vec/intrinsic_sao.c index a19b76d..30c6eed 100644 --- a/source/common/vec/intrinsic_sao.c +++ b/source/common/vec/intrinsic_sao.c @@ -45,9 +45,12 @@ #include #include + +#if !HIGH_BIT_DEPTH /* --------------------------------------------------------------------------- */ -void SAO_on_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i_block_w, int i_block_h, +void SAO_on_block_sse128(xavs2_t *h, + pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i_block_w, int i_block_h, int *lcu_avail, SAOBlkParam *sao_param) { int start_x, end_x, start_y, end_y; @@ -631,7 +634,7 @@ void SAO_on_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i __m128i t0, t1, t2, t3, t4, src0, src1; __m128i mask ; __m128i shift_mask = _mm_set1_epi8(31); - int shift_bo = g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; + int shift_bo = h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; int end_x_16 = i_block_w - 15; r0 = _mm_set1_epi8((int8_t)(sao_param->startBand)); @@ -692,7 +695,846 @@ void SAO_on_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i exit(-1); } } +} +#else + /***************************************************************************** + * Copyright (C) 2016 uavs2dec project, + * National Engineering Laboratory for Video 
Technology(Shenzhen), + * Digital Media R&D Center at Peking University Shenzhen Graduate School, China + * Project Leader: Ronggang Wang + * + * Main Authors: Zhenyu Wang , Kui Fan + * Shenghao Zhang <1219759986@qq.com>, Bingjie Han, Kaili Yao, Hongbin Cao, Yueming Wang, + * Jing Su, Jiaying Yan, Junru Li + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at rgwang@pkusz.edu.cn. 
+ *****************************************************************************/ +/* --------------------------------------------------------------------------- + */ +void SAO_on_block_eo_0_sse128(xavs2_t *h, + pel_t* dst, int i_dst, const pel_t* src, int i_src, + int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset) +{ + int start_x, end_x; + int x, y; + + int max_pixel = (1 << h->param->input_sample_bit_depth) - 1; + __m128i off0, off1, off2, off3, off4; + __m128i s0, s1, s2; + __m128i t0, t1, t2, t3, t4, etype; + __m128i c0, c1, c2, c3, c4; + __m128i mask; + __m128i min_val = _mm_setzero_si128(); + __m128i max_val = _mm_set1_epi16(max_pixel); + + int end_x_8; + c0 = _mm_set1_epi16(-2); + c1 = _mm_set1_epi16(-1); + c2 = _mm_set1_epi16(0); + c3 = _mm_set1_epi16(1); + c4 = _mm_set1_epi16(2); + + off0 = _mm_set1_epi16((pel_t)sao_offset[0]); + off1 = _mm_set1_epi16((pel_t)sao_offset[1]); + off2 = _mm_set1_epi16((pel_t)sao_offset[2]); + off3 = _mm_set1_epi16((pel_t)sao_offset[3]); + off4 = _mm_set1_epi16((pel_t)sao_offset[4]); + start_x = lcu_avail[SAO_L] ? 0 : 1; + end_x = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); + end_x_8 = end_x - ((end_x - start_x) & 0x07); + + mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x - end_x_8 - 1])); + if (i_block_w == 4) { + + + for (y = 0; y < i_block_h; y++) { + //diff = src[start_x] - src[start_x - 1]; + //leftsign = diff > 0 ? 1 : (diff < 0 ? 
-1 : 0); + s0 = _mm_loadu_si128((__m128i*) & src[start_x - 1]); + s1 = _mm_srli_si128(s0, 2); + s2 = _mm_srli_si128(s0, 4); + + t3 = _mm_min_epu16(s0, s1); + t1 = _mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, s1); + t0 = _mm_subs_epi16(t2, t1); //leftsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //rightsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = _mm_adds_epi16(t0, t4); + t0 = _mm_adds_epi16(t0, t2);//get offset + + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + _mm_maskmoveu_si128(t1, mask, (char*)(dst)); + + dst += i_dst; + src += i_src; + } + } + else { + + for (y = 0; y < i_block_h; y++) { + //diff = src[start_x] - src[start_x - 1]; + //leftsign = diff > 0 ? 1 : (diff < 0 ? 
-1 : 0); + for (x = start_x; x < end_x; x += 8) { + s0 = _mm_loadu_si128((__m128i*) & src[x - 1]); + s1 = _mm_loadu_si128((__m128i*) & src[x]); + s2 = _mm_loadu_si128((__m128i*) & src[x + 1]); + + t3 = _mm_min_epu16(s0, s1); + t1 = _mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, s1); + t0 = _mm_subs_epi16(t2, t1); //leftsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //rightsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = _mm_adds_epi16(t0, t4); + t0 = _mm_adds_epi16(t0, t2);//get offset + + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + if (x != end_x_8) { + _mm_storeu_si128((__m128i*)(dst + x), t1); + } + else { + _mm_maskmoveu_si128(t1, mask, (char*)(dst + x)); + break; + } + } + dst += i_dst; + src += i_src; + } + } } +void SAO_on_block_bo_sse128(xavs2_t *h, + pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const SAOBlkParam* saoBlkParam) +{ + int x, y; + int max_pixel = (1 << h->param->input_sample_bit_depth) - 1; + __m128i off0, off1, off2, off3; + __m128i s0, s1; + __m128i t0, t1, t2, t3; + __m128i mask; + __m128i min_val = _mm_setzero_si128(); + __m128i max_val = _mm_set1_epi16(max_pixel); + + + __m128i r0, r1, r2, r3; + int shift_bo = h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; + int end_x_8 = i_block_w - 7; + + r0 = _mm_set1_epi16(saoBlkParam->startBand); + r1 = _mm_set1_epi16((saoBlkParam->startBand + 1) % 32); + r2 = _mm_set1_epi16(saoBlkParam->deltaBand); 
+ r3 = _mm_set1_epi16((saoBlkParam->deltaBand + 1) % 32); + off0 = _mm_set1_epi16(saoBlkParam->offset[0]); + off1 = _mm_set1_epi16(saoBlkParam->offset[1]); + off2 = _mm_set1_epi16(saoBlkParam->offset[2]); + off3 = _mm_set1_epi16(saoBlkParam->offset[3]); + + if (i_block_w == 4) { + mask = _mm_set_epi32(0, 0, -1, -1); + + for (y = 0; y < i_block_h; y++) { + s0 = _mm_loadu_si128((__m128i*)src); + + s1 = _mm_srai_epi16(s0, shift_bo); + + t0 = _mm_cmpeq_epi16(s1, r0); + t1 = _mm_cmpeq_epi16(s1, r1); + t2 = _mm_cmpeq_epi16(s1, r2); + t3 = _mm_cmpeq_epi16(s1, r3); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t0 = _mm_or_si128(t0, t1); + t2 = _mm_or_si128(t2, t3); + t0 = _mm_or_si128(t0, t2); + + t1 = _mm_adds_epi16(s0, t0); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + _mm_maskmoveu_si128(t1, mask, (char *)(dst)); + + dst += i_dst; + src += i_src; + } + } + else { + mask = _mm_load_si128((const __m128i*)intrinsic_mask[(i_block_w & 7) - 1]); + + for (y = 0; y < i_block_h; y++) { + for (x = 0; x < i_block_w; x += 8) { + s0 = _mm_loadu_si128((__m128i*) & src[x]); + + s1 = _mm_srai_epi16(s0, shift_bo); + + t0 = _mm_cmpeq_epi16(s1, r0); + t1 = _mm_cmpeq_epi16(s1, r1); + t2 = _mm_cmpeq_epi16(s1, r2); + t3 = _mm_cmpeq_epi16(s1, r3); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t0 = _mm_or_si128(t0, t1); + t2 = _mm_or_si128(t2, t3); + t0 = _mm_or_si128(t0, t2); + //src0 = _mm_adds_epi8(src0, t0); + + //add 8 nums once for possible overflow + t1 = _mm_adds_epi16(s0, t0); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + if (x < end_x_8) { + _mm_storeu_si128((__m128i*) & dst[x], t1); + } + else { + _mm_maskmoveu_si128(t1, mask, (char *)(dst + x)); + } + + } + dst += i_dst; + src += i_src; + } + } + +} + +void SAO_on_block_eo_90_sse128(xavs2_t *h, + pel_t* dst, 
int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, const int* lcu_avail, const int* sao_offset) +{ + int start_y, end_y; + int x, y; + int max_pixel = (1 << h->param->input_sample_bit_depth) - 1; + __m128i off0, off1, off2, off3, off4; + __m128i s0, s1, s2; + __m128i t0, t1, t2, t3, t4, etype; + __m128i c0, c1, c2, c3, c4; + __m128i mask; + __m128i min_val = _mm_setzero_si128(); + __m128i max_val = _mm_set1_epi16(max_pixel); + + int end_x_8 = i_block_w - 7; + c0 = _mm_set1_epi16(-2); + c1 = _mm_set1_epi16(-1); + c2 = _mm_set1_epi16(0); + c3 = _mm_set1_epi16(1); + c4 = _mm_set1_epi16(2); + + off0 = _mm_set1_epi16((pel_t)sao_offset[0]); + off1 = _mm_set1_epi16((pel_t)sao_offset[1]); + off2 = _mm_set1_epi16((pel_t)sao_offset[2]); + off3 = _mm_set1_epi16((pel_t)sao_offset[3]); + off4 = _mm_set1_epi16((pel_t)sao_offset[4]); + start_y = lcu_avail[SAO_T] ? 0 : 1; + end_y = lcu_avail[SAO_D] ? i_block_h : (i_block_h - 1); + + dst += start_y * i_dst; + src += start_y * i_src; + + if (i_block_w == 4) { + mask = _mm_set_epi32(0, 0, -1, -1); + + for (y = start_y; y < end_y; y++) { + s0 = _mm_loadu_si128((__m128i*)(src - i_src)); + s1 = _mm_loadu_si128((__m128i*)src); + s2 = _mm_loadu_si128((__m128i*)(src + i_src)); + + t3 = _mm_min_epu16(s0, s1); + t1 = _mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, s1); + t0 = _mm_subs_epi16(t2, t1); //upsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //downsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = _mm_adds_epi16(t0, t4); + 
t0 = _mm_adds_epi16(t0, t2);//get offset + + //add 8 nums once for possible overflow + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + _mm_maskmoveu_si128(t1, mask, (char *)(dst)); + + dst += i_dst; + src += i_src; + } + } + else { + if (i_block_w & 0x07) { + mask = _mm_set_epi32(0, 0, -1, -1); + + for (y = start_y; y < end_y; y++) { + for (x = 0; x < i_block_w; x += 8) { + s0 = _mm_loadu_si128((__m128i*) & src[x - i_src]); + s1 = _mm_loadu_si128((__m128i*) & src[x]); + s2 = _mm_loadu_si128((__m128i*) & src[x + i_src]); + + t3 = _mm_min_epu16(s0, s1); + t1 = _mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, s1); + t0 = _mm_subs_epi16(t2, t1); //upsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //downsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = _mm_adds_epi16(t0, t4); + t0 = _mm_adds_epi16(t0, t2);//get offset + + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + if (x < end_x_8) { + _mm_storeu_si128((__m128i*)(dst + x), t1); + } + else { + _mm_maskmoveu_si128(t1, mask, (char *)(dst + x)); + break; + } + } + dst += i_dst; + src += i_src; + } + } + else { + for (y = start_y; y < end_y; y++) { + for (x = 0; x < i_block_w; x += 8) { + s0 = _mm_loadu_si128((__m128i*) & src[x - i_src]); + s1 = _mm_loadu_si128((__m128i*) & src[x]); + s2 = _mm_loadu_si128((__m128i*) & src[x + i_src]); + + t3 = _mm_min_epu16(s0, s1); + t1 = _mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, 
s1); + t0 = _mm_subs_epi16(t2, t1); //upsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //downsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = _mm_adds_epi16(t0, t4); + t0 = _mm_adds_epi16(t0, t2);//get offset + + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + _mm_storeu_si128((__m128i*)(dst + x), t1); + } + dst += i_dst; + src += i_src; + } + } + } +} + +void SAO_on_block_eo_135_sse128(xavs2_t *h, + pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, + const int* lcu_avail, const int* sao_offset) +{ + int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; + int x, y; + int max_pixel = (1 << h->param->input_sample_bit_depth) - 1; + __m128i off0, off1, off2, off3, off4; + __m128i s0, s1, s2; + __m128i t0, t1, t2, t3, t4, etype; + __m128i c0, c1, c2, c3, c4; + __m128i min_val = _mm_setzero_si128(); + __m128i max_val = _mm_set1_epi16(max_pixel); + + __m128i mask_r0, mask_r, mask_rn; + int end_x_r0_8, end_x_r_8, end_x_rn_8; + + c0 = _mm_set1_epi16(-2); + c1 = _mm_set1_epi16(-1); + c2 = _mm_set1_epi16(0); + c3 = _mm_set1_epi16(1); + c4 = _mm_set1_epi16(2); + + off0 = _mm_set1_epi16((pel_t)sao_offset[0]); + off1 = _mm_set1_epi16((pel_t)sao_offset[1]); + off2 = _mm_set1_epi16((pel_t)sao_offset[2]); + off3 = _mm_set1_epi16((pel_t)sao_offset[3]); + off4 = _mm_set1_epi16((pel_t)sao_offset[4]); + + start_x_r0 = lcu_avail[SAO_TL] ? 0 : 1; + end_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1)) : 1; + start_x_r = lcu_avail[SAO_L] ? 0 : 1; + end_x_r = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); + start_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); + end_x_rn = lcu_avail[SAO_DR] ? i_block_w : (i_block_w - 1); + + end_x_r0_8 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x07); + end_x_r_8 = end_x_r - ((end_x_r - start_x_r) & 0x07); + end_x_rn_8 = end_x_rn - ((end_x_rn - start_x_rn) & 0x07); + + + //first row + for (x = start_x_r0; x < end_x_r0; x += 8) { + s0 = _mm_loadu_si128((__m128i*) & src[x - i_src - 1]); + s1 = _mm_loadu_si128((__m128i*) & src[x]); + s2 = _mm_loadu_si128((__m128i*) & src[x + i_src + 1]); + + t3 = _mm_min_epu16(s0, s1); + t1 = _mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, s1); + t0 = _mm_subs_epi16(t2, t1); //upsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //downsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = _mm_adds_epi16(t0, t4); + t0 = _mm_adds_epi16(t0, t2);//get offset + + + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + if (x != end_x_r0_8) { + _mm_storeu_si128((__m128i*)(dst + x), t1); + } + else { + mask_r0 = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_8 - 1])); + _mm_maskmoveu_si128(t1, mask_r0, (char *)(dst + x)); + break; + } + } + dst += i_dst; + src += i_src; + + mask_r = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_8 - 1])); + //middle rows + for (y = 1; y < i_block_h - 1; y++) { + for (x 
= start_x_r; x < end_x_r; x += 8) { + s0 = _mm_loadu_si128((__m128i*) & src[x - i_src - 1]); + s1 = _mm_loadu_si128((__m128i*) & src[x]); + s2 = _mm_loadu_si128((__m128i*) & src[x + i_src + 1]); + + t3 = _mm_min_epu16(s0, s1); + t1 = _mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, s1); + t0 = _mm_subs_epi16(t2, t1); //upsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //downsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = _mm_adds_epi16(t0, t4); + t0 = _mm_adds_epi16(t0, t2);//get offset + + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + if (x != end_x_r_8) { + _mm_storeu_si128((__m128i*)(dst + x), t1); + } + else { + _mm_maskmoveu_si128(t1, mask_r, (char *)(dst + x)); + break; + } + } + dst += i_dst; + src += i_src; + } + //last row + for (x = start_x_rn; x < end_x_rn; x += 8) { + s0 = _mm_loadu_si128((__m128i*) & src[x - i_src - 1]); + s1 = _mm_loadu_si128((__m128i*) & src[x]); + s2 = _mm_loadu_si128((__m128i*) & src[x + i_src + 1]); + + t3 = _mm_min_epu16(s0, s1); + t1 = _mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, s1); + t0 = _mm_subs_epi16(t2, t1); //upsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //downsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, 
c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = _mm_adds_epi16(t0, t4); + t0 = _mm_adds_epi16(t0, t2);//get offset + + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + if (x != end_x_rn_8) { + _mm_storeu_si128((__m128i*)(dst + x), t1); + } + else { + mask_rn = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_8 - 1])); + _mm_maskmoveu_si128(t1, mask_rn, (char *)(dst + x)); + break; + } + } +} + +void SAO_on_block_eo_45_sse128(xavs2_t *h, + pel_t* dst, int i_dst, const pel_t* src, int i_src, int i_block_w, int i_block_h, + const int* lcu_avail, const int* sao_offset) +{ + int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; + int x, y; + int max_pixel = (1 << h->param->input_sample_bit_depth) - 1; + __m128i off0, off1, off2, off3, off4; + __m128i s0, s1, s2; + __m128i t0, t1, t2, t3, t4, etype; + __m128i c0, c1, c2, c3, c4; + __m128i min_val = _mm_setzero_si128(); + __m128i max_val = _mm_set1_epi16(max_pixel); + + __m128i mask_r0, mask_r, mask_rn; + int end_x_r0_8, end_x_r_8, end_x_rn_8; + + c0 = _mm_set1_epi16(-2); + c1 = _mm_set1_epi16(-1); + c2 = _mm_set1_epi16(0); + c3 = _mm_set1_epi16(1); + c4 = _mm_set1_epi16(2); + + off0 = _mm_set1_epi16((pel_t)sao_offset[0]); + off1 = _mm_set1_epi16((pel_t)sao_offset[1]); + off2 = _mm_set1_epi16((pel_t)sao_offset[2]); + off3 = _mm_set1_epi16((pel_t)sao_offset[3]); + off4 = _mm_set1_epi16((pel_t)sao_offset[4]); + + start_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); + end_x_r0 = lcu_avail[SAO_TR] ? i_block_w : (i_block_w - 1); + start_x_r = lcu_avail[SAO_L] ? 0 : 1; + end_x_r = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); + start_x_rn = lcu_avail[SAO_DL] ? 0 : 1; + end_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1)) : 1; + + end_x_r0_8 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x07); + end_x_r_8 = end_x_r - ((end_x_r - start_x_r) & 0x07); + end_x_rn_8 = end_x_rn - ((end_x_rn - start_x_rn) & 0x07); + + + //first row + for (x = start_x_r0; x < end_x_r0; x += 8) { + s0 = _mm_loadu_si128((__m128i*) & src[x - i_src + 1]); + s1 = _mm_loadu_si128((__m128i*) & src[x]); + s2 = _mm_loadu_si128((__m128i*) & src[x + i_src - 1]); + + t3 = _mm_min_epu16(s0, s1); + t1 = _mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, s1); + t0 = _mm_subs_epi16(t2, t1); //upsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //downsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = _mm_adds_epi16(t0, t4); + t0 = _mm_adds_epi16(t0, t2);//get offset + + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + if (x != end_x_r0_8) { + _mm_storeu_si128((__m128i*)(dst + x), t1); + } + else { + mask_r0 = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_8 - 1])); + _mm_maskmoveu_si128(t1, mask_r0, (char *)(dst + x)); + break; + } + } + dst += i_dst; + src += i_src; + + mask_r = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_8 - 1])); + //middle rows + for (y = 1; y < i_block_h - 1; y++) { + for (x = start_x_r; x < end_x_r; x += 8) { + s0 = _mm_loadu_si128((__m128i*) & src[x - i_src + 1]); + s1 = _mm_loadu_si128((__m128i*) & src[x]); + s2 = _mm_loadu_si128((__m128i*) & src[x + i_src - 1]); + + t3 = _mm_min_epu16(s0, s1); + t1 = 
_mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, s1); + t0 = _mm_subs_epi16(t2, t1); //upsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //downsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = _mm_adds_epi16(t0, t4); + t0 = _mm_adds_epi16(t0, t2);//get offset + + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + if (x != end_x_r_8) { + _mm_storeu_si128((__m128i*)(dst + x), t1); + } + else { + _mm_maskmoveu_si128(t1, mask_r, (char *)(dst + x)); + break; + } + } + dst += i_dst; + src += i_src; + } + for (x = start_x_rn; x < end_x_rn; x += 8) { + s0 = _mm_loadu_si128((__m128i*) & src[x - i_src + 1]); + s1 = _mm_loadu_si128((__m128i*) & src[x]); + s2 = _mm_loadu_si128((__m128i*) & src[x + i_src - 1]); + + t3 = _mm_min_epu16(s0, s1); + t1 = _mm_cmpeq_epi16(t3, s0); + t2 = _mm_cmpeq_epi16(t3, s1); + t0 = _mm_subs_epi16(t2, t1); //upsign + + t3 = _mm_min_epu16(s1, s2); + t1 = _mm_cmpeq_epi16(t3, s1); + t2 = _mm_cmpeq_epi16(t3, s2); + t3 = _mm_subs_epi16(t1, t2); //downsign + + etype = _mm_adds_epi16(t0, t3); //edgetype + + t0 = _mm_cmpeq_epi16(etype, c0); + t1 = _mm_cmpeq_epi16(etype, c1); + t2 = _mm_cmpeq_epi16(etype, c2); + t3 = _mm_cmpeq_epi16(etype, c3); + t4 = _mm_cmpeq_epi16(etype, c4); + + t0 = _mm_and_si128(t0, off0); + t1 = _mm_and_si128(t1, off1); + t2 = _mm_and_si128(t2, off2); + t3 = _mm_and_si128(t3, off3); + t4 = _mm_and_si128(t4, off4); + + t0 = _mm_adds_epi16(t0, t1); + t2 = _mm_adds_epi16(t2, t3); + t0 = 
_mm_adds_epi16(t0, t4); + t0 = _mm_adds_epi16(t0, t2);//get offset + + t1 = _mm_adds_epi16(t0, s1); + t1 = _mm_min_epi16(t1, max_val); + t1 = _mm_max_epi16(t1, min_val); + + if (x != end_x_rn_8) { + _mm_storeu_si128((__m128i*)(dst + x), t1); + } + else { + mask_rn = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_8 - 1])); + _mm_maskmoveu_si128(t1, mask_rn, (char *)(dst + x)); + break; + } + } +} +#endif // !HIGH_BIT_DEPTH diff --git a/source/common/x86/dct8.h b/source/common/x86/dct8.h index b0ec1aa..d5b4c70 100644 --- a/source/common/x86/dct8.h +++ b/source/common/x86/dct8.h @@ -28,14 +28,16 @@ #define XAVS2_I386_DCT8_H #define xavs2_dct_4x4_sse2 FPFX(dct_4x4_sse2) -void xavs2_dct_4x4_sse2 (const coeff_t *src, coeff_t *dst, int i_src); +void xavs2_dct_4x4_sse2 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); +#if defined(__AVX2__) #define xavs2_dct_4x4_avx2 FPFX(dct_4x4_avx2) void xavs2_dct_4x4_avx2 (const coeff_t *src, coeff_t *dst, int i_src); +#endif #define xavs2_dct_8x8_sse2 FPFX(dct_8x8_sse2) -void xavs2_dct_8x8_sse2 (const coeff_t *src, coeff_t *dst, int i_src); +void xavs2_dct_8x8_sse2 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); #define xavs2_dct_8x8_sse4 FPFX(dct_8x8_sse4) -void xavs2_dct_8x8_sse4 (const coeff_t *src, coeff_t *dst, int i_src); -#if ARCH_X86_64 +void xavs2_dct_8x8_sse4 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_src); +#if ARCH_X86_64 && defined(__AVX2__) #define xavs2_dct_8x8_avx2 FPFX(dct_8x8_avx2) void xavs2_dct_8x8_avx2 (const coeff_t *src, coeff_t *dst, int i_src); #define xavs2_dct_16x16_avx2 FPFX(dct_16x16_avx2) @@ -45,14 +47,14 @@ void xavs2_dct_32x32_avx2 (const coeff_t *src, coeff_t *dst, int i_src); #endif #define xavs2_idct_4x4_sse2 FPFX(idct_4x4_sse2) -void xavs2_idct_4x4_sse2 (const coeff_t *src, coeff_t *dst, int i_dst); +void xavs2_idct_4x4_sse2 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); #define xavs2_idct_8x8_ssse3 FPFX(idct_8x8_ssse3) -void 
xavs2_idct_8x8_ssse3 (const coeff_t *src, coeff_t *dst, int i_dst); -#if ARCH_X86_64 +void xavs2_idct_8x8_ssse3 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); +#define xavs2_idct_8x8_sse2 FPFX(idct_8x8_sse2) +void xavs2_idct_8x8_sse2 (xavs2_t *h, const coeff_t *src, coeff_t *dst, int i_dst); +#if ARCH_X86_64 && defined(__AVX2__) #define xavs2_idct_4x4_avx2 FPFX(idct_4x4_avx2) void xavs2_idct_4x4_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); -#define xavs2_idct_8x8_sse2 FPFX(idct_8x8_sse2) -void xavs2_idct_8x8_sse2 (const coeff_t *src, coeff_t *dst, int i_dst); #define xavs2_idct_8x8_avx2 FPFX(idct_8x8_avx2) void xavs2_idct_8x8_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define xavs2_idct_16x16_avx2 FPFX(idct_16x16_avx2) diff --git a/source/common/x86/pixel-util.h b/source/common/x86/pixel-util.h index c70c3ef..d6b69b2 100644 --- a/source/common/x86/pixel-util.h +++ b/source/common/x86/pixel-util.h @@ -38,11 +38,12 @@ void xavs2_getResidual16_sse4(const pel_t *fenc, const pel_t *pred, int16_t *res void xavs2_getResidual32_sse2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); #define xavs2_getResidual32_sse4 FPFX(getResidual32_sse4) void xavs2_getResidual32_sse4(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); +#if defined(__AVX2__) #define xavs2_getResidual16_avx2 FPFX(getResidual16_avx2) void xavs2_getResidual16_avx2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); #define xavs2_getResidual32_avx2 FPFX(getResidual32_avx2) void xavs2_getResidual32_avx2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); - +#endif #define xavs2_transpose4_sse2 FPFX(transpose4_sse2) void xavs2_transpose4_sse2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose8_sse2 FPFX(transpose8_sse2) @@ -53,7 +54,7 @@ void xavs2_transpose16_sse2(pel_t *dst, const pel_t *src, intptr_t stride); void xavs2_transpose32_sse2(pel_t *dst, const pel_t *src, 
intptr_t stride); #define xavs2_transpose64_sse2 FPFX(transpose64_sse2) void xavs2_transpose64_sse2(pel_t *dst, const pel_t *src, intptr_t stride); - +#if defined(__AVX2__) #define xavs2_transpose8_avx2 FPFX(transpose8_avx2) void xavs2_transpose8_avx2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose16_avx2 FPFX(transpose16_avx2) @@ -62,7 +63,7 @@ void xavs2_transpose16_avx2(pel_t *dst, const pel_t *src, intptr_t stride); void xavs2_transpose32_avx2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose64_avx2 FPFX(transpose64_avx2) void xavs2_transpose64_avx2(pel_t *dst, const pel_t *src, intptr_t stride); - +#endif #define xavs2_count_nonzero_4x4_ssse3 FPFX(count_nonzero_4x4_ssse3) int xavs2_count_nonzero_4x4_ssse3(const int16_t *quantCoeff); #define xavs2_count_nonzero_8x8_ssse3 FPFX(count_nonzero_8x8_ssse3) @@ -71,6 +72,7 @@ int xavs2_count_nonzero_8x8_ssse3(const int16_t *quantCoeff); int xavs2_count_nonzero_16x16_ssse3(const int16_t *quantCoeff); #define xavs2_count_nonzero_32x32_ssse3 FPFX(count_nonzero_32x32_ssse3) int xavs2_count_nonzero_32x32_ssse3(const int16_t *quantCoeff); +#if defined(__AVX2__) #define xavs2_count_nonzero_4x4_avx2 FPFX(count_nonzero_4x4_avx2) int xavs2_count_nonzero_4x4_avx2(const int16_t *quantCoeff); #define xavs2_count_nonzero_8x8_avx2 FPFX(count_nonzero_8x8_avx2) @@ -79,11 +81,11 @@ int xavs2_count_nonzero_8x8_avx2(const int16_t *quantCoeff); int xavs2_count_nonzero_16x16_avx2(const int16_t *quantCoeff); #define xavs2_count_nonzero_32x32_avx2 FPFX(count_nonzero_32x32_avx2) int xavs2_count_nonzero_32x32_avx2(const int16_t *quantCoeff); - -#define xavs2_weight_pp_sse4 FPFX(weight_pp_sse4) -void xavs2_weight_pp_sse4(const pel_t *src, pel_t *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); #define xavs2_weight_pp_avx2 FPFX(weight_pp_avx2) void xavs2_weight_pp_avx2(const pel_t *src, pel_t *dst, intptr_t stride, int width, int height, int w0, int round, int 
shift, int offset); +#endif +#define xavs2_weight_pp_sse4 FPFX(weight_pp_sse4) +void xavs2_weight_pp_sse4(const pel_t *src, pel_t *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); #define xavs2_weight_sp_sse4 FPFX(weight_sp_sse4) void xavs2_weight_sp_sse4(const int16_t *src, pel_t *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); @@ -103,17 +105,18 @@ float xavs2_pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width); #define xavs2_scale1D_128to64_ssse3 FPFX(scale1D_128to64_ssse3) void xavs2_scale1D_128to64_ssse3(pel_t*, const pel_t*); -#define xavs2_scale1D_128to64_avx2 FPFX(scale1D_128to64_avx2) -void xavs2_scale1D_128to64_avx2(pel_t*, const pel_t*); #define xavs2_scale2D_64to32_ssse3 FPFX(scale2D_64to32_ssse3) void xavs2_scale2D_64to32_ssse3(pel_t*, const pel_t*, intptr_t); +#if defined(__AVX2__) +#define xavs2_scale1D_128to64_avx2 FPFX(scale1D_128to64_avx2) +void xavs2_scale1D_128to64_avx2(pel_t*, const pel_t*); #define xavs2_scale2D_64to32_avx2 FPFX(scale2D_64to32_avx2) void xavs2_scale2D_64to32_avx2(pel_t*, const pel_t*, intptr_t); - -#define xavs2_scanPosLast_x64 FPFX(scanPosLast_x64) -int xavs2_scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t *scanCG4x4, const int trSize); #define xavs2_scanPosLast_avx2_bmi2 FPFX(scanPosLast_avx2_bmi2) int xavs2_scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t *scanCG4x4, const int trSize); +#endif +#define xavs2_scanPosLast_x64 FPFX(scanPosLast_x64) +int xavs2_scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t *scanCG4x4, const int trSize); #define xavs2_findPosFirstLast_ssse3 FPFX(findPosFirstLast_ssse3) uint32_t 
xavs2_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]); @@ -123,7 +126,8 @@ uint32_t xavs2_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, int #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \ void xavs2_pixel_sub_ps_ ## W ## x ## H ## cpu(coeff_t *dst, intptr_t destride, const pel_t *src0, const pel_t *src1, intptr_t srcstride0, intptr_t srcstride1); \ - void xavs2_pixel_add_ps_ ## W ## x ## H ## cpu(pel_t *dst, intptr_t destride, const pel_t *src0, const coeff_t * src1, intptr_t srcStride0, intptr_t srcStride1); + void xavs2_pixel_add_ps_ ## W ## x ## H ## cpu(xavs2_t *h, pel_t *dst, intptr_t destride, const pel_t *src0, const coeff_t * src1, intptr_t srcStride0, intptr_t srcStride1); + #define CHROMA_PIXELSUB_DEF(cpu) \ SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \ @@ -139,7 +143,7 @@ uint32_t xavs2_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, int #define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \ void xavs2_pixel_sub_ps_ ## W ## x ## H ## cpu(coeff_t *dst, intptr_t destride, const pel_t *src0, const pel_t *src1, intptr_t srcstride0, intptr_t srcstride1); \ - void xavs2_pixel_add_ps_ ## W ## x ## H ## cpu(pel_t *dst, intptr_t destride, const pel_t *src0, const coeff_t * src1, intptr_t srcStride0, intptr_t srcStride1); + void xavs2_pixel_add_ps_ ## W ## x ## H ## cpu(xavs2_t *h, pel_t *dst, intptr_t destride, const pel_t *src0, const coeff_t * src1, intptr_t srcStride0, intptr_t srcStride1); #define LUMA_PIXELSUB_DEF(cpu) \ SETUP_LUMA_PIXELSUB_PS_FUNC(8, 8, cpu); \ diff --git a/source/common/x86/pixel.h b/source/common/x86/pixel.h index 06370b7..266e4cc 100644 --- a/source/common/x86/pixel.h +++ b/source/common/x86/pixel.h @@ -114,7 +114,7 @@ FUNCDEF_PU(void, pixel_sad_x3, cpu, const pel_t*, const pel_t*, const pel_t*, const pel_t*, intptr_t, int32_t*);\ FUNCDEF_PU(void, pixel_sad_x4, cpu, const pel_t*, const pel_t*, const pel_t*, const pel_t*, const pel_t*, intptr_t, int32_t*);\ 
FUNCDEF_PU(void, pixel_avg, cpu, pel_t* dst, intptr_t dstride, const pel_t* src0, intptr_t sstride0, const pel_t* src1, intptr_t sstride1, int);\ - FUNCDEF_PU(void, pixel_add_ps, cpu, pel_t* a, intptr_t dstride, const pel_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1);\ + FUNCDEF_PU(void, pixel_add_ps, cpu, xavs2_t *h, pel_t* a, intptr_t dstride, const pel_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1);\ FUNCDEF_PU(void, pixel_sub_ps, cpu, coeff_t* a, intptr_t dstride, const pel_t* b0, const pel_t* b1, intptr_t sstride0, intptr_t sstride1);\ FUNCDEF_PU(int, pixel_satd, cpu, const pel_t*, intptr_t, const pel_t*, intptr_t);\ FUNCDEF_PU(int, pixel_sad, cpu, const pel_t*, intptr_t, const pel_t*, intptr_t);\ diff --git a/source/common/x86/pixeladd8.asm b/source/common/x86/pixeladd8.asm index a0f2fb5..2d39648 100644 --- a/source/common/x86/pixeladd8.asm +++ b/source/common/x86/pixeladd8.asm @@ -34,11 +34,11 @@ SECTION .text cextern pw_pixel_max ;----------------------------------------------------------------------------- -; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +; void pixel_add_ps_4x4(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_XMM sse2 -cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_4x4, 6, 6, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mova m1, [pw_pixel_max] pxor m0, m0 add r4, r4 @@ -68,7 +68,7 @@ cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcSt RET %else INIT_XMM sse4 -cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_4x4, 6, 6, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1 add r5, r5 pmovzxbw m0, [r2] 
pmovzxbw m2, [r2 + r4] @@ -101,12 +101,12 @@ cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcSt ;----------------------------------------------------------------------------- -; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +; void pixel_add_ps_4x%2(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W4_H4 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 -cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_4x%2, 6, 7, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mova m1, [pw_pixel_max] pxor m0, m0 mov r6d, %2/4 @@ -143,7 +143,7 @@ cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcS RET %else INIT_XMM sse4 -cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_4x%2, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/4 add r5, r5 .loop: @@ -187,12 +187,12 @@ PIXEL_ADD_PS_W4_H4 4, 16 ;----------------------------------------------------------------------------- -; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +; void pixel_add_ps_8x%2(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W8_H4 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 -cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_8x%2, 6, 7, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/4 @@ -235,7 +235,7 @@ cglobal pixel_add_ps_8x%2, 6, 7, 
6, dest, destride, src0, scr1, srcStride0, srcS RET %else INIT_XMM sse4 -cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_8x%2, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/4 add r5, r5 .loop: @@ -280,12 +280,12 @@ PIXEL_ADD_PS_W8_H4 8, 32 ;----------------------------------------------------------------------------- -; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +; void pixel_add_ps_16x%2(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W16_H4 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 -cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_16x%2, 6, 7, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/4 @@ -352,7 +352,7 @@ cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, src RET %else INIT_XMM sse4 -cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_16x%2, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/4 add r5, r5 .loop: @@ -413,13 +413,13 @@ PIXEL_ADD_PS_W16_H4 16, 32 PIXEL_ADD_PS_W16_H4 16, 64 ;----------------------------------------------------------------------------- -; void pixel_add_ps_16x16(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +; void pixel_add_ps_16x16(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W16_H4_avx2 1 %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_YMM avx2 -cglobal 
pixel_add_ps_16x%1, 6, 10, 4, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_16x%1, 6, 10, 4, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mova m3, [pw_pixel_max] pxor m2, m2 mov r6d, %1/4 @@ -464,7 +464,7 @@ cglobal pixel_add_ps_16x%1, 6, 10, 4, dest, destride, src0, scr1, srcStride0, sr %endif %else INIT_YMM avx2 -cglobal pixel_add_ps_16x%1, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_16x%1, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %1/4 add r5, r5 .loop: @@ -519,12 +519,12 @@ PIXEL_ADD_PS_W16_H4_avx2 64 ;----------------------------------------------------------------------------- -; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +; void pixel_add_ps_32x%2(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W32_H2 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 -cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_32x%2, 6, 7, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/2 @@ -588,7 +588,7 @@ cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, src RET %else INIT_XMM sse4 -cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_32x%2, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/2 add r5, r5 .loop: @@ -644,13 +644,13 @@ PIXEL_ADD_PS_W32_H2 32, 32 PIXEL_ADD_PS_W32_H2 32, 64 ;----------------------------------------------------------------------------- -; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +; void pixel_add_ps_32x32(xavs2_t* 
bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W32_H4_avx2 1 %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_YMM avx2 -cglobal pixel_add_ps_32x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_32x%1, 6, 10, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %1/4 @@ -716,7 +716,7 @@ cglobal pixel_add_ps_32x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, sr %else %if ARCH_X86_64 INIT_YMM avx2 -cglobal pixel_add_ps_32x%1, 6, 10, 8, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_32x%1, 6, 10, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %1/4 add r5, r5 lea r7, [r4 * 3] @@ -786,12 +786,12 @@ PIXEL_ADD_PS_W32_H4_avx2 64 ;----------------------------------------------------------------------------- -; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +; void pixel_add_ps_64x%2(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W64_H2 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 -cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_64x%2, 6, 7, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/2 @@ -903,7 +903,7 @@ cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, src RET %else INIT_XMM sse4 -cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_64x%2, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/2 add r5, r5 .loop: @@ -995,13 +995,13 @@ 
PIXEL_ADD_PS_W64_H2 64, 16 PIXEL_ADD_PS_W64_H2 64, 64 ;----------------------------------------------------------------------------- -; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +; void pixel_add_ps_64x64(xavs2_t* bb, pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W64H4_avx2 1 %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_YMM avx2 -cglobal pixel_add_ps_64x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_64x%1, 6, 10, 6, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %1/4 @@ -1110,7 +1110,7 @@ cglobal pixel_add_ps_64x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, sr %endif %else INIT_YMM avx2 -cglobal pixel_add_ps_64x%1, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 +cglobal pixel_add_ps_64x%1, 6, 7, 8, bb, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %1/2 add r5, r5 .loop: diff --git a/source/configw.h b/source/configw.h index 0021edd..cb7d12e 100644 --- a/source/configw.h +++ b/source/configw.h @@ -37,7 +37,7 @@ #ifndef XAVS2_CONFIGW_H #define XAVS2_CONFIGW_H -#if defined(__ICL) || defined(_MSC_VER) +#if defined(__ICL) || defined(_MSC_VER) || defined(__MINGW64_VERSION_MAJOR) /* arch */ #define ARCH_X86 1 @@ -57,7 +57,7 @@ #ifndef __SSE__ #define __SSE__ #endif -#define HAVE_MMX 1 /* X86 */ +#define HAVE_MMX 0 /* X86 */ #define HAVE_ALTIVEC 0 /* ALTIVEC */ #define HAVE_ALTIVEC_H 0 #define HAVE_ARMV6 0 diff --git a/source/encoder/alf.c b/source/encoder/alf.c index 4e69076..9826e23 100644 --- a/source/encoder/alf.c +++ b/source/encoder/alf.c @@ -99,17 +99,17 @@ typedef struct dh_nc { } DhNc; typedef struct { - int64_t m_autoCorr[NO_VAR_BINS][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; // auto-correlation matrix - double 
m_crossCorr[NO_VAR_BINS][ALF_MAX_NUM_COEF]; // cross-correlation - double pixAcc[NO_VAR_BINS]; + long long int m_autoCorr[NO_VAR_BINS][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; // auto-correlation matrix + double m_crossCorr[NO_VAR_BINS][ALF_MAX_NUM_COEF]; // cross-correlation + double pixAcc[NO_VAR_BINS]; } AlfCorrData; typedef struct { double m_cross_merged[NO_VAR_BINS][ALF_MAX_NUM_COEF]; - int64_t m_auto_merged[NO_VAR_BINS][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; + long long int m_auto_merged[NO_VAR_BINS][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; double m_cross_temp[ALF_MAX_NUM_COEF]; double m_pixAcc_merged[NO_VAR_BINS]; - int64_t m_auto_temp[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; + long long int m_auto_temp[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; int m_coeffNoFilter[NO_VAR_BINS][ALF_MAX_NUM_COEF]; int m_filterCoeffSym[NO_VAR_BINS][ALF_MAX_NUM_COEF]; @@ -231,9 +231,9 @@ void copyALFparam(ALFParam *dst, ALFParam *src, int componentID) * calculate the correlation matrix for Luma */ static -void calcCorrOneCompRegionLuma(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *org, int i_org, pel_t *rec, int i_rec, +void calcCorrOneCompRegionLuma8(xavs2_t *h, alf_ctx_t *Enc_ALF, pel8_t *org, int i_org, pel8_t *rec, int i_rec, int yPos, int xPos, int height, int width, - int64_t m_autoCorr[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], + long long int m_autoCorr[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double m_crossCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *pixAcc, int isLeftAvail, int isRightAvail, int isAboveAvail, int isBelowAvail) @@ -245,15 +245,114 @@ void calcCorrOneCompRegionLuma(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *org, int i int endPosLuma = isBelowAvail ? (yPos + height - 4) : (yPos + height); int xOffSetLeft = isLeftAvail ? -3 : 0; int xOffSetRight = isRightAvail ? 
3 : 0; - pel_t *imgPad = rec; - pel_t *imgOrg = org; + pel8_t *imgPad = rec; + pel8_t *imgOrg = org; int yUp, yBottom; int xLeft, xRight; int ELocal[ALF_MAX_NUM_COEF]; - pel_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6; + pel8_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6; int i, j, k, l, yLocal, varInd; - int64_t(*E)[9]; + long long int(*E)[9]; + double *yy; + + imgPad += startPosLuma * i_rec; + imgOrg += startPosLuma * i_org; + + varInd = Enc_ALF->tab_lcu_region[(yPos >> h->i_lcu_level) * h->i_width_in_lcu + (xPos >> h->i_lcu_level)]; + int step = 1; + if (IS_ALG_ENABLE(OPT_FAST_ALF)) { + step = 2; + } + for (i = startPosLuma; i < endPosLuma; i += step) { + yUp = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i - 1); + yBottom = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i + 1); + imgPad1 = imgPad + (yBottom - i) * i_rec; + imgPad2 = imgPad + (yUp - i) * i_rec; + + yUp = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i - 2); + yBottom = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i + 2); + imgPad3 = imgPad + (yBottom - i) * i_rec; + imgPad4 = imgPad + (yUp - i) * i_rec; + + yUp = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i - 3); + yBottom = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i + 3); + imgPad5 = imgPad + (yBottom - i) * i_rec; + imgPad6 = imgPad + (yUp - i) * i_rec; + + for (j = xPos; j < xPosEnd; j += step) { + memset(ELocal, 0, N * sizeof(int)); + + ELocal[0] = (imgPad5[j] + imgPad6[j]); + ELocal[1] = (imgPad3[j] + imgPad4[j]); + + xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 1); + xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 1); + ELocal[2] = (imgPad1[xRight] + imgPad2[xLeft]); + ELocal[3] = (imgPad1[j ] + imgPad2[j ]); + ELocal[4] = (imgPad1[xLeft] + imgPad2[xRight]); + ELocal[7] = (imgPad[xRight] + imgPad[xLeft]); + + xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 2); + xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 2); + 
ELocal[6] = (imgPad[xRight] + imgPad[xLeft]); + + xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 3); + xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 3); + ELocal[5] = (imgPad[xRight] + imgPad[xLeft]); + ELocal[8] = (imgPad[j ]); + + yLocal = imgOrg[j]; + pixAcc[varInd] += (yLocal * yLocal); + E = m_autoCorr[varInd]; + yy = m_crossCorr[varInd]; + + for (k = 0; k < N; k++) { + for (l = k; l < N; l++) { + E[k][l] += (ELocal[k] * ELocal[l]); + } + yy[k] += (double)(ELocal[k] * yLocal); + } + } + + imgPad += i_rec; + imgOrg += i_org; + } + + for (varInd = 0; varInd < NO_VAR_BINS; varInd++) { + E = m_autoCorr[varInd]; + for (k = 1; k < N; k++) { + for (l = 0; l < k; l++) { + E[k][l] = E[l][k]; + } + } + } +} + +static +void calcCorrOneCompRegionLuma10(xavs2_t *h, alf_ctx_t *Enc_ALF, pel10_t *org, int i_org, pel10_t *rec, int i_rec, + int yPos, int xPos, int height, int width, + long long int m_autoCorr[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], + double m_crossCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], + double *pixAcc, + int isLeftAvail, int isRightAvail, int isAboveAvail, int isBelowAvail) +{ + int xPosEnd = xPos + width; + int N = ALF_MAX_NUM_COEF; //m_sqrFiltLengthTab[0]; + + int startPosLuma = isAboveAvail ? (yPos - 4) : yPos; + int endPosLuma = isBelowAvail ? (yPos + height - 4) : (yPos + height); + int xOffSetLeft = isLeftAvail ? -3 : 0; + int xOffSetRight = isRightAvail ? 
3 : 0; + pel10_t *imgPad = rec; + pel10_t *imgOrg = org; + int yUp, yBottom; + int xLeft, xRight; + + int ELocal[ALF_MAX_NUM_COEF]; + pel10_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6; + int i, j, k, l, yLocal, varInd; + long long int(*E)[9]; double *yy; imgPad += startPosLuma * i_rec; @@ -333,8 +432,8 @@ void calcCorrOneCompRegionLuma(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *org, int i * calculate the correlation matrix for Chroma */ static -void calcCorrOneCompRegionChma(xavs2_t *h, pel_t *org, int i_org, pel_t *rec, int i_rec, int yPos, int xPos, int height, int width, - int64_t m_autoCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *m_crossCorr, +void calcCorrOneCompRegionChma8(xavs2_t *h, pel8_t *org, int i_org, pel8_t *rec, int i_rec, int yPos, int xPos, int height, int width, + long long int m_autoCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *m_crossCorr, int isLeftAvail, int isRightAvail, int isAboveAvail, int isBelowAvail) { int xPosEnd = xPos + width; @@ -344,13 +443,102 @@ void calcCorrOneCompRegionChma(xavs2_t *h, pel_t *org, int i_org, pel_t *rec, in int endPosChroma = isBelowAvail ? (yPos + height - 4) : (yPos + height); int xOffSetLeft = isLeftAvail ? -3 : 0; int xOffSetRight = isRightAvail ? 
3 : 0; - pel_t *imgPad = rec; - pel_t *imgOrg = org; + pel8_t *imgPad = rec; + pel8_t *imgOrg = org; int yUp, yBottom; int xLeft, xRight; int ELocal[ALF_MAX_NUM_COEF]; - pel_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6; + pel8_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6; + int i, j, k, l, yLocal; + + imgPad += startPosChroma * i_rec; + imgOrg += startPosChroma * i_org; + + int step = 1; + if (IS_ALG_ENABLE(OPT_FAST_ALF)) { + step = 2; + } + for (i = startPosChroma; i < endPosChroma; i += step) { + yUp = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i - 1); + yBottom = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i + 1); + imgPad1 = imgPad + (yBottom - i) * i_rec; + imgPad2 = imgPad + (yUp - i) * i_rec; + + yUp = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i - 2); + yBottom = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i + 2); + imgPad3 = imgPad + (yBottom - i) * i_rec; + imgPad4 = imgPad + (yUp - i) * i_rec; + + yUp = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i - 3); + yBottom = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i + 3); + imgPad5 = imgPad + (yBottom - i) * i_rec; + imgPad6 = imgPad + (yUp - i) * i_rec; + + for (j = xPos; j < xPosEnd; j += step) { + memset(ELocal, 0, N * sizeof(int)); + + ELocal[0] = (imgPad5[j] + imgPad6[j]); + ELocal[1] = (imgPad3[j] + imgPad4[j]); + + xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 1); + xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 1); + ELocal[2] = (imgPad1[xRight] + imgPad2[xLeft]); + ELocal[3] = (imgPad1[j ] + imgPad2[j ]); + ELocal[4] = (imgPad1[xLeft] + imgPad2[xRight]); + ELocal[7] = (imgPad[xRight] + imgPad[xLeft]); + + xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 2); + xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 2); + ELocal[6] = (imgPad[xRight] + imgPad[xLeft]); + + xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 3); + xRight = 
XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 3); + ELocal[5] = (imgPad[xRight] + imgPad[xLeft]); + ELocal[8] = (imgPad[j ]); + + yLocal = (int)imgOrg[j]; + + for (k = 0; k < N; k++) { + m_autoCorr[k][k] += ELocal[k] * ELocal[k]; + for (l = k + 1; l < N; l++) { + m_autoCorr[k][l] += ELocal[k] * ELocal[l]; + } + + m_crossCorr[k] += yLocal * ELocal[k]; + } + } + + imgPad += i_rec; + imgOrg += i_org; + } + + for (j = 0; j < N - 1; j++) { + for (i = j + 1; i < N; i++) { + m_autoCorr[i][j] = m_autoCorr[j][i]; + } + } +} + +static +void calcCorrOneCompRegionChma10(xavs2_t *h, pel10_t *org, int i_org, pel10_t *rec, int i_rec, int yPos, int xPos, int height, int width, + long long int m_autoCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *m_crossCorr, + int isLeftAvail, int isRightAvail, int isAboveAvail, int isBelowAvail) +{ + int xPosEnd = xPos + width; + const int N = ALF_MAX_NUM_COEF; //m_sqrFiltLengthTab[0]; + + int startPosChroma = isAboveAvail ? (yPos - 4) : yPos; + int endPosChroma = isBelowAvail ? (yPos + height - 4) : (yPos + height); + int xOffSetLeft = isLeftAvail ? -3 : 0; + int xOffSetRight = isRightAvail ? 
3 : 0; + pel10_t *imgPad = rec; + pel10_t *imgOrg = org; + int yUp, yBottom; + int xLeft, xRight; + + int ELocal[ALF_MAX_NUM_COEF]; + pel10_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6; int i, j, k, l, yLocal; imgPad += startPosChroma * i_rec; @@ -451,7 +639,7 @@ void deriveBoundaryAvail(xavs2_t *h, int pic_x, int pic_y, int size_lcu = 1 << h->i_lcu_level; int mb_x, mb_y; //int pic_mb_width = h->i_width_in_mincu; - //cu_info_t *cuCurr, *cuLeft, *cuRight, *cuAbove, *cuBelow; + //cu_info_t *cuCurr, *cuLeft, *cuRight, *cuAbove, *cuBelow; mb_x = pic_x >> MIN_CU_SIZE_IN_BIT; mb_y = pic_y >> MIN_CU_SIZE_IN_BIT; @@ -514,8 +702,9 @@ void alf_get_statistics_lcu(xavs2_t *h, int lcu_x, int lcu_y, reset_alfCorr(alfCorr, compIdx); formatShift = 1; - calcCorrOneCompRegionChma(h, p_org->planes[compIdx], p_org->i_stride[compIdx], - p_rec->planes[compIdx], p_rec->i_stride[compIdx], + if (h->param->input_sample_bit_depth == 8) { + calcCorrOneCompRegionChma8(h, p_org->planes8[compIdx], p_org->i_stride[compIdx], + p_rec->planes8[compIdx], p_rec->i_stride[compIdx], ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0], @@ -524,9 +713,9 @@ void alf_get_statistics_lcu(xavs2_t *h, int lcu_x, int lcu_y, compIdx = IMG_V; alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu]; reset_alfCorr(alfCorr, compIdx); - //V分量的ypos, xpos, height, width四个值与U分量一样,不需要修改 - calcCorrOneCompRegionChma(h, p_org->planes[compIdx], p_org->i_stride[compIdx], - p_rec->planes[compIdx], p_rec->i_stride[compIdx], + //V鍒嗛噺鐨剏pos, xpos, height, width鍥涗釜鍊间笌U鍒嗛噺涓鏍凤紝涓嶉渶瑕佷慨鏀 + calcCorrOneCompRegionChma8(h, p_org->planes8[compIdx], p_org->i_stride[compIdx], + p_rec->planes8[compIdx], p_rec->i_stride[compIdx], ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0], @@ -536,12 +725,42 @@ void alf_get_statistics_lcu(xavs2_t *h, int lcu_x, 
int lcu_y, alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu]; reset_alfCorr(alfCorr, compIdx); formatShift = 0; - calcCorrOneCompRegionLuma(h, Enc_ALF, p_org->planes[compIdx], p_org->i_stride[compIdx], - p_rec->planes[compIdx], p_rec->i_stride[compIdx], + calcCorrOneCompRegionLuma8(h, Enc_ALF, p_org->planes8[compIdx], p_org->i_stride[compIdx], + p_rec->planes8[compIdx], p_rec->i_stride[compIdx], ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, alfCorr->m_autoCorr, alfCorr->m_crossCorr, alfCorr->pixAcc, isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail); + } else { + calcCorrOneCompRegionChma10(h, p_org->planes10[compIdx], p_org->i_stride[compIdx], + p_rec->planes10[compIdx], p_rec->i_stride[compIdx], + ctuYPos >> formatShift, ctuXPos >> formatShift, + ctuHeight >> formatShift, ctuWidth >> formatShift, + alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0], + isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail); + + compIdx = IMG_V; + alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu]; + reset_alfCorr(alfCorr, compIdx); + //V鍒嗛噺鐨剏pos, xpos, height, width鍥涗釜鍊间笌U鍒嗛噺涓鏍凤紝涓嶉渶瑕佷慨鏀 + calcCorrOneCompRegionChma10(h, p_org->planes10[compIdx], p_org->i_stride[compIdx], + p_rec->planes10[compIdx], p_rec->i_stride[compIdx], + ctuYPos >> formatShift, ctuXPos >> formatShift, + ctuHeight >> formatShift, ctuWidth >> formatShift, + alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0], + isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail); + + compIdx = IMG_Y; + alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu]; + reset_alfCorr(alfCorr, compIdx); + formatShift = 0; + calcCorrOneCompRegionLuma10(h, Enc_ALF, p_org->planes10[compIdx], p_org->i_stride[compIdx], + p_rec->planes10[compIdx], p_rec->i_stride[compIdx], + ctuYPos >> formatShift, ctuXPos >> formatShift, + ctuHeight >> formatShift, ctuWidth >> formatShift, + alfCorr->m_autoCorr, alfCorr->m_crossCorr, alfCorr->pixAcc, + isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail); + } } @@ -560,7 +779,7 @@ 
static void mergeFrom(AlfCorrData *dst, AlfCorrData *src, int *mergeTable, int doPixAccMerge, int componentID) { int numCoef = ALF_MAX_NUM_COEF; - int64_t (*srcE)[ALF_MAX_NUM_COEF], (*dstE)[ALF_MAX_NUM_COEF]; + long long int (*srcE)[ALF_MAX_NUM_COEF], (*dstE)[ALF_MAX_NUM_COEF]; double *srcy, *dsty; int maxFilterSetSize, j, i, varInd, filtIdx; @@ -651,7 +870,7 @@ static uint32_t estimateALFBitrateInPicHeader(ALFParam *alfPicParam) */ static long xFastFiltDistEstimation(alf_ctx_t *Enc_ALF, - int64_t ppdE[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], + long long int ppdE[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *pdy, int *piCoeff, int iFiltLength) { //static memory @@ -711,13 +930,73 @@ long estimateFilterDistortion(alf_ctx_t *Enc_ALF, int compIdx, AlfCorrData *alfC /* --------------------------------------------------------------------------- */ static -dist_t calcAlfLCUDist(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx, +dist_t calcAlfLCUDist8(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx, + int ypos, int xpos, int height, int width, int isAboveAvail, + pel8_t *picSrc, int i_src, pel8_t *picCmp, int i_cmp) +{ + dist_t dist = 0; + pel8_t *pelCmp = picCmp; + pel8_t *pelSrc = picSrc; + + int notSkipLinesRightVB = TRUE; + int notSkipLinesBelowVB = TRUE; + //int NumCUsInFrame, numLCUInPicWidth, numLCUInPicHeight; + + //numLCUInPicHeight = h->i_height_in_lcu; + //numLCUInPicWidth = h->i_width_in_lcu; + //NumCUsInFrame = numLCUInPicHeight * numLCUInPicWidth; + + switch (compIdx) { + case IMG_U: + case IMG_V: + if (!notSkipLinesBelowVB) { + height = height - (int)(DF_CHANGED_SIZE >> 1) - (int)(ALF_FOOTPRINT_SIZE >> 1); + } + + if (!notSkipLinesRightVB) { + width = width - (int)(DF_CHANGED_SIZE >> 1) - (int)(ALF_FOOTPRINT_SIZE >> 1); + } + + if (isAboveAvail) { + pelSrc += ((ypos - 4) * i_src) + xpos; + pelCmp += ((ypos - 4) * i_cmp) + xpos; + } else { + pelSrc += (ypos * i_src) + xpos; + pelCmp += (ypos * i_cmp) + xpos; + } + break; + default: + // case IMG_Y: + if 
(!notSkipLinesBelowVB) { + height = height - (int)(DF_CHANGED_SIZE)-(int)(ALF_FOOTPRINT_SIZE >> 1); + } + + if (!notSkipLinesRightVB) { + width = width - (int)(DF_CHANGED_SIZE)-(int)(ALF_FOOTPRINT_SIZE >> 1); + } + + pelCmp = picCmp + (ypos * i_cmp) + xpos; + pelSrc = picSrc + (ypos * i_src) + xpos; + break; + } + if (PART_INDEX(width, height) == LUMA_INVALID) { + uint32_t uiShift = Enc_ALF->m_uiBitIncrement << 1; + dist += g_funcs.pixf.ssd_block8(pelSrc, i_src, pelCmp, i_cmp, width, height) >> uiShift; + } else { + dist += g_funcs.pixf.ssd8[PART_INDEX(width, height)](pelSrc, i_src, pelCmp, i_cmp); + } + + return dist; +} + +static +dist_t calcAlfLCUDist10(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx, int ypos, int xpos, int height, int width, int isAboveAvail, - pel_t *picSrc, int i_src, pel_t *picCmp, int i_cmp) + pel10_t *picSrc, int i_src, pel10_t *picCmp, int i_cmp) { dist_t dist = 0; - pel_t *pelCmp = picCmp; - pel_t *pelSrc = picSrc; + pel10_t *pelCmp = picCmp; + pel10_t *pelSrc = picSrc; int notSkipLinesRightVB = TRUE; int notSkipLinesBelowVB = TRUE; @@ -762,9 +1041,9 @@ dist_t calcAlfLCUDist(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx, } if (PART_INDEX(width, height) == LUMA_INVALID) { uint32_t uiShift = Enc_ALF->m_uiBitIncrement << 1; - dist += g_funcs.pixf.ssd_block(pelSrc, i_src, pelCmp, i_cmp, width, height) >> uiShift; + dist += g_funcs.pixf.ssd_block10(pelSrc, i_src, pelCmp, i_cmp, width, height) >> uiShift; } else { - dist += g_funcs.pixf.ssd[PART_INDEX(width, height)](pelSrc, i_src, pelCmp, i_cmp); + dist += g_funcs.pixf.ssd10[PART_INDEX(width, height)](pelSrc, i_src, pelCmp, i_cmp); } return dist; @@ -774,7 +1053,7 @@ dist_t calcAlfLCUDist(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx, * ALF filter on CTB */ static -void filterOneCTB(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, +void filterOneCTB8(xavs2_t *h, alf_ctx_t *Enc_ALF, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src, int compIdx, ALFParam *alfParam, 
int ypos, int height, int xpos, int width, int isAboveAvail, int isBelowAvail) { @@ -792,11 +1071,37 @@ void filterOneCTB(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *p_dst, int i_dst, pel_t coef = Enc_ALF->m_filterCoeffSym[0]; } + g_funcs.alf_flt8[0](h, p_dst, i_dst, p_src, i_src, + xpos, ypos, width, height, coef, + isAboveAvail, isBelowAvail); + g_funcs.alf_flt8[1](h, p_dst, i_dst, p_src, i_src, + xpos, ypos, width, height, coef, + isAboveAvail, isBelowAvail); +} + +static +void filterOneCTB10(xavs2_t *h, alf_ctx_t *Enc_ALF, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src, + int compIdx, ALFParam *alfParam, int ypos, int height, int xpos, int width, + int isAboveAvail, int isBelowAvail) +{ + int *coef; + + //reconstruct coefficients to m_filterCoeffSym and m_varIndTab + reconstructCoefInfo(compIdx, alfParam, Enc_ALF->m_filterCoeffSym, Enc_ALF->m_varIndTab); //reconstruct ALF coefficients & related parameters + + //derive CTB start positions, width, and height. If the boundary is not available, skip boundary samples. 
+ + if (compIdx == IMG_Y) { + int var = Enc_ALF->tab_lcu_region[(ypos >> h->i_lcu_level) * h->i_width_in_lcu + (xpos >> h->i_lcu_level)]; + coef = Enc_ALF->m_filterCoeffSym[Enc_ALF->m_varIndTab[var]]; + } else { + coef = Enc_ALF->m_filterCoeffSym[0]; + } - g_funcs.alf_flt[0](p_dst, i_dst, p_src, i_src, + g_funcs.alf_flt10[0](h, p_dst, i_dst, p_src, i_src, xpos, ypos, width, height, coef, isAboveAvail, isBelowAvail); - g_funcs.alf_flt[1](p_dst, i_dst, p_src, i_src, + g_funcs.alf_flt10[1](h, p_dst, i_dst, p_src, i_src, xpos, ypos, width, height, coef, isAboveAvail, isBelowAvail); } @@ -804,7 +1109,7 @@ void filterOneCTB(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *p_dst, int i_dst, pel_t /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE -void copyOneAlfBlk(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int ypos, int xpos, +void copyOneAlfBlk8(xavs2_t *h, pel8_t *p_dst, int i_dst, pel8_t *p_src, int i_src, int ypos, int xpos, int height, int width, int isAboveAvail, int isBelowAvail) { int startPos = isAboveAvail ? (ypos - 4) : ypos; @@ -812,7 +1117,19 @@ void copyOneAlfBlk(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int ypos, i p_dst += (startPos * i_dst) + xpos; p_src += (startPos * i_src) + xpos; - g_funcs.plane_copy(p_dst, i_dst, p_src, i_src, width, endPos - startPos); + g_funcs.plane_copy8(h, p_dst, i_dst, p_src, i_src, width, endPos - startPos); +} + +static ALWAYS_INLINE +void copyOneAlfBlk10(xavs2_t *h, pel10_t *p_dst, int i_dst, pel10_t *p_src, int i_src, int ypos, int xpos, + int height, int width, int isAboveAvail, int isBelowAvail) +{ + int startPos = isAboveAvail ? (ypos - 4) : ypos; + int endPos = isBelowAvail ? 
(ypos + height - 4) : ypos + height; + p_dst += (startPos * i_dst) + xpos; + p_src += (startPos * i_src) + xpos; + + g_funcs.plane_copy10(h, p_dst, i_dst, p_src, i_src, width, endPos - startPos); } /* --------------------------------------------------------------------------- @@ -941,9 +1258,10 @@ void executePicLCUOnOffDecision(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, AL int i_org = 0; int i_rec_before = 0; int i_rec_after = 0; - pel_t *p_org_pixel = NULL; - pel_t *p_rec_before = NULL; - pel_t *p_rec_after = NULL; + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_org_pixel = NULL; + pel8_t *p_rec_before = NULL; + pel8_t *p_rec_after = NULL; double lambda_luma, lambda_chroma; int img_height, img_width; int size_lcu = 1 << h->i_lcu_level; @@ -988,20 +1306,20 @@ void executePicLCUOnOffDecision(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, AL } formatShift = (compIdx == IMG_Y) ? 0 : 1; - p_org_pixel = p_org->planes[compIdx]; + p_org_pixel = p_org->planes8[compIdx]; i_org = p_org->i_stride[compIdx]; - p_rec_before = p_rec->planes[compIdx]; + p_rec_before = p_rec->planes8[compIdx]; i_rec_before = p_rec->i_stride[compIdx]; - p_rec_after = p_dst->planes[compIdx]; + p_rec_after = p_dst->planes8[compIdx]; i_rec_after = p_dst->i_stride[compIdx]; // ALF on - filterOneCTB(h, Enc_ALF, p_rec_after, i_rec_after, p_rec_before, i_rec_before, compIdx, + filterOneCTB8(h, Enc_ALF, p_rec_after, i_rec_after, p_rec_before, i_rec_before, compIdx, &alfPictureParam[compIdx], ctuYPos >> formatShift, ctuHeight >> formatShift, ctuXPos >> formatShift, ctuWidth >> formatShift, isAboveAvail, isBelowAvail); - distEnc = calcAlfLCUDist(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift, + distEnc = calcAlfLCUDist8(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, p_org_pixel, i_org, p_rec_after, i_rec_after); - distEnc -= calcAlfLCUDist(h, Enc_ALF, compIdx, ctuYPos >> formatShift, 
ctuXPos >> formatShift, + distEnc -= calcAlfLCUDist8(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, p_org_pixel, i_org, p_rec_before, i_rec_before); h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr); @@ -1022,7 +1340,7 @@ void executePicLCUOnOffDecision(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, AL h->is_alf_lcu_on[ctu][compIdx] = (costEnc < costOff) ? TRUE : FALSE; if (!h->is_alf_lcu_on[ctu][compIdx]) { - copyOneAlfBlk(p_rec_after, i_rec_after, p_rec_before, i_rec_before, + copyOneAlfBlk8(h, p_rec_after, i_rec_after, p_rec_before, i_rec_before, ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, isBelowAvail); } @@ -1060,12 +1378,139 @@ void executePicLCUOnOffDecision(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, AL h->is_alf_lcu_on[ctu][compIdx] = FALSE; } - g_funcs.plane_copy(p_dst->planes[compIdx], p_dst->i_stride[compIdx], - p_rec->planes[compIdx], p_rec->i_stride[compIdx], + g_funcs.plane_copy8(h, p_dst->planes8[compIdx], p_dst->i_stride[compIdx], + p_rec->planes8[compIdx], p_rec->i_stride[compIdx], p_rec->i_width[compIdx], p_rec->i_lines[compIdx]); } } } + } else { + pel10_t *p_org_pixel = NULL; + pel10_t *p_rec_before = NULL; + pel10_t *p_rec_after = NULL; + double lambda_luma, lambda_chroma; + int img_height, img_width; + int size_lcu = 1 << h->i_lcu_level; + int ctux, ctuy; + int NumCUsInFrame, numLCUInPicWidth, numLCUInPicHeight; + int rate, noFilters; + + h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_initial); + h->copy_aec_state_rdo(&h->cs_data.cs_alf_cu_ctr, p_aec); + + img_height = h->i_height; + img_width = h->i_width; + numLCUInPicHeight = h->i_height_in_lcu; + numLCUInPicWidth = h->i_width_in_lcu; + NumCUsInFrame = numLCUInPicHeight * numLCUInPicWidth; + + lambda_luma = lambda; //VKTBD lambda is not correct + lambda_chroma = LAMBDA_SCALE_CHROMA * lambda_luma; + for (compIdx = 0; compIdx < 
IMG_CMPNTS; compIdx++) { + distBestPic[compIdx] = 0; + rateBestPic[compIdx] = 0; + } + + for (ctuy = 0, ctu = 0; ctuy < numLCUInPicHeight; ctuy++) { + //derive CTU height + ctuYPos = ctuy * size_lcu; + ctuHeight = XAVS2_MIN(img_height - ctuYPos, size_lcu); + for (ctux = 0; ctux < numLCUInPicWidth; ctux++, ctu++) { + //derive CTU width + ctuXPos = ctux * size_lcu; + ctuWidth = XAVS2_MIN(img_width - ctuXPos, size_lcu); + + //derive CTU boundary availabilities + deriveBoundaryAvail(h, ctuXPos, ctuYPos, + &isLeftAvail, &isRightAvail, &isAboveAvail, &isBelowAvail); + + for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { + //if slice-level enabled flag is 0, set CTB-level enabled flag 0 + if (alfPictureParam[compIdx].alf_flag == 0) { + h->is_alf_lcu_on[ctu][compIdx] = FALSE; + continue; + } + + formatShift = (compIdx == IMG_Y) ? 0 : 1; + p_org_pixel = p_org->planes10[compIdx]; + i_org = p_org->i_stride[compIdx]; + p_rec_before = p_rec->planes10[compIdx]; + i_rec_before = p_rec->i_stride[compIdx]; + p_rec_after = p_dst->planes10[compIdx]; + i_rec_after = p_dst->i_stride[compIdx]; + + // ALF on + filterOneCTB10(h, Enc_ALF, p_rec_after, i_rec_after, p_rec_before, i_rec_before, compIdx, + &alfPictureParam[compIdx], ctuYPos >> formatShift, ctuHeight >> formatShift, + ctuXPos >> formatShift, ctuWidth >> formatShift, isAboveAvail, isBelowAvail); + distEnc = calcAlfLCUDist10(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift, + ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, p_org_pixel, i_org, p_rec_after, i_rec_after); + distEnc -= calcAlfLCUDist10(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift, + ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, p_org_pixel, i_org, p_rec_before, i_rec_before); + + h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr); + + rateEnc = p_aec->binary.write_alf_lcu_ctrl(p_aec, 1); + + costEnc = (double)distEnc + (compIdx == 0 ? 
lambda_luma : lambda_chroma) * rateEnc; + + // ALF off + distOff = 0; + //rateOff = 1; + h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr); + rateOff = p_aec->binary.write_alf_lcu_ctrl(p_aec, 0); + + costOff = (double)distOff + (compIdx == 0 ? lambda_luma : lambda_chroma) * rateOff; + + //set CTB-level on/off flag + h->is_alf_lcu_on[ctu][compIdx] = (costEnc < costOff) ? TRUE : FALSE; + + if (!h->is_alf_lcu_on[ctu][compIdx]) { + copyOneAlfBlk10(h, p_rec_after, i_rec_after, p_rec_before, i_rec_before, + ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, + isAboveAvail, isBelowAvail); + } + + //update CABAC status + //cabacCoder->updateAlfCtrlFlagState(m_pcPic->getCU(ctu)->getAlfLCUEnabled(compIdx)?1:0); + + h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr); + rateOff = p_aec->binary.write_alf_lcu_ctrl(p_aec, (h->is_alf_lcu_on[ctu][compIdx] ? 1 : 0)); + h->copy_aec_state_rdo(&h->cs_data.cs_alf_cu_ctr, p_aec); + + rateBestPic[compIdx] += (h->is_alf_lcu_on[ctu][compIdx] ? rateEnc : rateOff); + distBestPic[compIdx] += (h->is_alf_lcu_on[ctu][compIdx] ? distEnc : distOff); + + } //CTB + } + } //CTU + + for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { + if (alfPictureParam[compIdx].alf_flag == 1) { + double Lambda = (compIdx == 0 ? 
lambda_luma : lambda_chroma); + rate = ALFParamBitrateEstimate(&alfPictureParam[compIdx]); + if (compIdx == IMG_Y) { + noFilters = alfPictureParam[0].filters_per_group - 1; + rate += uvlc_bitrate_estimate[noFilters] + (4 * noFilters); + } + costAlfOn = (double)distBestPic[compIdx] + Lambda * + (rateBestPic[compIdx] + (double)(rate)); + + costAlfOff = 0; + + if (costAlfOn >= costAlfOff) { + alfPictureParam[compIdx].alf_flag = 0; + for (ctu = 0; ctu < NumCUsInFrame; ctu++) { + h->is_alf_lcu_on[ctu][compIdx] = FALSE; + } + + g_funcs.plane_copy10(h, p_dst->planes10[compIdx], p_dst->i_stride[compIdx], + p_rec->planes10[compIdx], p_rec->i_stride[compIdx], + p_rec->i_width[compIdx], p_rec->i_lines[compIdx]); + } + } + } + } } /* --------------------------------------------------------------------------- @@ -1209,7 +1654,7 @@ static void gnsBacksubstitution(double R[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], do /* --------------------------------------------------------------------------- */ -static int gnsCholeskyDec(int64_t inpMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double outMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int noEq) +static int gnsCholeskyDec(long long int inpMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double outMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int noEq) { int i, j, k; /* Looping Variables */ double scale; /* scaling factor for each row */ @@ -1245,7 +1690,7 @@ static int gnsCholeskyDec(int64_t inpMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], d /* --------------------------------------------------------------------------- */ -static int gnsSolveByChol(int64_t LHS[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *rhs, double *x, int noEq) +static int gnsSolveByChol(long long int LHS[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *rhs, double *x, int noEq) { double aux[ALF_MAX_NUM_COEF]; /* Auxiliary vector */ double U[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; /* Upper triangular Cholesky factor of LHS */ @@ -1291,7 +1736,7 @@ static int gnsSolveByChol(int64_t 
LHS[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], doubl /* --------------------------------------------------------------------------- */ -static double calculateErrorAbs(int64_t A[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *b, double y, int size) +static double calculateErrorAbs(long long int A[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *b, double y, int size) { int i; double error, sum; @@ -1311,7 +1756,7 @@ static double calculateErrorAbs(int64_t A[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], d /* --------------------------------------------------------------------------- */ static -double mergeFiltersGreedy(alf_ctx_t *Enc_ALF, double yGlobalSeq[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int64_t EGlobalSeq[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], +double mergeFiltersGreedy(alf_ctx_t *Enc_ALF, double yGlobalSeq[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], long long int EGlobalSeq[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *pixAccGlobalSeq, int intervalBest[NO_VAR_BINS][2], int sqrFiltLength, int noIntervals) { int first, ind, ind1, ind2, i, j, bestToMerge; @@ -1465,7 +1910,7 @@ static double xfindBestCoeffCodMethod(int filterCoeffSymQuant[][ALF_MAX_NUM_COEF /* --------------------------------------------------------------------------- */ -static void add_A(int64_t Amerged[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int64_t A[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int start, int stop, int size) +static void add_A(int64_t Amerged[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], long long int A[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int start, int stop, int size) { int i, j, ind; @@ -1527,7 +1972,7 @@ static double calculateErrorCoeffProvided(int64_t A[ALF_MAX_NUM_COEF][ALF_MAX_NU /* --------------------------------------------------------------------------- */ -static double QuantizeIntegerFilterPP(double *filterCoeff, int *filterCoeffQuant, int64_t E[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *y, int sqrFiltLength) +static double QuantizeIntegerFilterPP(double *filterCoeff, int *filterCoeffQuant, long long 
int E[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *y, int sqrFiltLength) { double error; int filterCoeffQuantMod[ALF_MAX_NUM_COEF]; @@ -1598,7 +2043,7 @@ static double QuantizeIntegerFilterPP(double *filterCoeff, int *filterCoeffQuant /* --------------------------------------------------------------------------- */ -static double findFilterCoeff(alf_ctx_t *Enc_ALF, int64_t EGlobalSeq[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double yGlobalSeq[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], +static double findFilterCoeff(alf_ctx_t *Enc_ALF, long long int EGlobalSeq[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double yGlobalSeq[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *pixAccGlobalSeq, int filterCoeffSeq[][ALF_MAX_NUM_COEF], int filterCoeffQuantSeq[][ALF_MAX_NUM_COEF], int intervalBest[NO_VAR_BINS][2], int varIndTab[NO_VAR_BINS], int sqrFiltLength, int filters_per_fr, double errorTabForce0Coeff[NO_VAR_BINS][2]) { @@ -1635,7 +2080,7 @@ static double findFilterCoeff(alf_ctx_t *Enc_ALF, int64_t EGlobalSeq[][ALF_MAX_N /* --------------------------------------------------------------------------- */ static -void xfindBestFilterVarPred(alf_ctx_t *Enc_ALF, double ySym[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int64_t ESym[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], +void xfindBestFilterVarPred(alf_ctx_t *Enc_ALF, double ySym[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], long long int ESym[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *pixAcc, int filterCoeffSym[][ALF_MAX_NUM_COEF], int *filters_per_fr_best, int varIndTab[], double lambda_val, int numMaxFilters) { int filterCoeffSymQuant[NO_VAR_BINS][ALF_MAX_NUM_COEF]; @@ -1816,7 +2261,7 @@ void deriveFilterInfo(alf_ctx_t *Enc_ALF, ALFParam *alfPictureParam, AlfCorrData * Input: * alfPictureParam: The ALF parameter * apsId: The ALF parameter index in the buffer - * isNewApsSent:The New flag index + * isNewApsSent: The New flag index * lambda : The lambda value in the ALF-RD decision * Return: * 
--------------------------------------------------------------------------- @@ -1889,7 +2334,7 @@ int alf_get_buffer_size(const xavs2_param_t *param) */ void alf_init_buffer(xavs2_t *h, uint8_t *mem_base) { - // 希尔伯特扫描顺序 + // Hilbert scan order static const uint8_t regionTable[NO_VAR_BINS] = { 0, 1, 4, 5, 15, 2, 3, 6, 14, 11, 10, 7, 13, 12, 9, 8 } diff --git a/source/encoder/encoder.c b/source/encoder/encoder.c index d5b3890..bf739bc 100644 --- a/source/encoder/encoder.c +++ b/source/encoder/encoder.c @@ -97,6 +97,7 @@ extern double tab_qsfd_thres[MAX_QP][2][CTU_DEPTH]; static ALWAYS_INLINE void qsfd_calculate_threshold_of_a_frame(xavs2_t *h) { + double tab_qsfd_thres[MAX_QP + (h->param->sample_bit_depth - 8) * 8][2][CTU_DEPTH]; assert(sizeof(h->thres_qsfd_cu) == sizeof(tab_qsfd_thres[0])); memcpy(h->thres_qsfd_cu, tab_qsfd_thres[h->i_qp], sizeof(h->thres_qsfd_cu)); @@ -187,8 +188,8 @@ void encoder_output_frame_bitstream(xavs2_handler_t *h_mgr, xavs2_frame_t *frame */ void encoder_fetch_one_encoded_frame(xavs2_handler_t *h_mgr, xavs2_outpacket_t *packet, int is_flush) { - int num_encoding_frames = h_mgr->num_encode - h_mgr->num_output; // 正在编码帧数 - int num_frames_threads = h_mgr->i_frm_threads; // 并行帧数 + int num_encoding_frames = h_mgr->num_encode - h_mgr->num_output; // number of frames being encoded + int num_frames_threads = h_mgr->i_frm_threads; // number of parallel frames /* clear packet data */ packet->len = 0; @@ -695,7 +696,7 @@ static void *encoder_aec_encode_one_frame(xavs2_t *h) xavs2_lcu_terminat_bit_write(p_aec, lcu_xy == slice->i_last_lcu_xy); } - /* 仅考虑LCU行级的Slice划分方式 */ + /* only LCU-row-level slice partitioning is considered */ if (lcu_xy >= slice->i_last_lcu_xy) { int bs_len; /* slice done */ @@ -895,7 +896,7 @@ static void encoder_decide_level_id(xavs2_param_t *param) { const int tab_level_restriction[][5] = { /* LevelID, MaxWidth, MaxHeight, MaxFps, MaxKBps */ - { 0x00, 8192, 8192, 0, 0 }, // 禁止 + { 0x00, 8192, 8192, 0, 0 }, // forbidden { 0x10, 352, 288, 15, 1500 }, // 2.0.15 { 0x12, 352, 288, 30, 2000 }, // 2.0.30 { 0x14, 
352, 288, 60, 2500 }, // 2.0.60 @@ -919,14 +920,14 @@ static void encoder_decide_level_id(xavs2_param_t *param) { 0x66, 8192, 4608, 60, 480000 }, // 10.2.60 { 0x68, 8192, 4608, 120, 240000 }, // 10.0.120 { 0x6A, 8192, 4608, 120, 800000 }, // 10.2.120 - { 0x00, 16384, 8192, 120, 8000000 }, // 禁止 + { 0x00, 16384, 8192, 120, 8000000 }, // forbidden }; int i = 1; int i_last_level = 0; for (; tab_level_restriction[i][4] != 0;) { - /* 未开启码控时,设置为最大 */ + /* when rate control is disabled, set to the maximum */ if (param->i_rc_method == 0 && param->org_width <= tab_level_restriction[i_last_level][1] && param->org_height <= tab_level_restriction[i_last_level][2] && @@ -934,16 +935,16 @@ static void encoder_decide_level_id(xavs2_param_t *param) param->org_height <= tab_level_restriction[i][2] && tab_level_restriction[i_last_level][1] < tab_level_restriction[i][1] && tab_level_restriction[i_last_level][2] < tab_level_restriction[i][2]) { - /* 码率控制未开启时,选择满足条件的分辨率下的最高档 */ + /* when rate control is disabled, select the highest level among those whose resolution limits are satisfied */ i = i_last_level; break; } - /* 分辨率、帧率符合要求 */ + /* resolution and frame rate satisfy the requirements */ if (param->org_width <= tab_level_restriction[i][1] && param->org_height <= tab_level_restriction[i][2] && param->frame_rate <= tab_level_restriction[i][3]) { i_last_level = i; - /* 比特率已设定,可根据最大码率设置LevelID */ + /* bitrate has been set, so LevelID can be chosen according to the maximum bitrate */ if (param->i_rc_method != 0 && param->i_target_bitrate * 1.5 <= tab_level_restriction[i][4] * 1000 && param->bitrate_upper <= tab_level_restriction[i][4] * 1000) { @@ -1001,8 +1002,8 @@ int encoder_check_parameters(xavs2_param_t *param) return -1; } - /* 多Slice下不能开启 cross slice loop filter,会影响并行效率 - * TODO: 后续可支持 */ + /* cross slice loop filter cannot be enabled with multiple slices; it would hurt parallel efficiency + * TODO: may be supported later */ if (param->slice_num > 1 && param->b_cross_slice_loop_filter != FALSE) { xavs2_log(NULL, XAVS2_LOG_WARNING, "Un-supported cross slice loop filter, forcing not filtering\n"); param->b_cross_slice_loop_filter = FALSE; @@ -1059,7 +1060,7 @@ int encoder_check_parameters(xavs2_param_t *param) } /* check bit depth 
*/ - if (param->profile_id != MAIN_PROFILE) { + if (param->profile_id != MAIN_PROFILE && param->sample_bit_depth == 8) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Not Supported profile \"%d\", HIGH_BIT_DEPTH macro haven`t turn on!\n", param->profile_id); return -1; @@ -1111,8 +1112,10 @@ int encoder_check_parameters(xavs2_param_t *param) } } + int max_qp = MAX_QP + (param->sample_bit_depth - 8) * 8; + /* check QP */ - if (param->i_initial_qp > MAX_QP || param->i_initial_qp < MIN_QP) { + if (param->i_initial_qp > max_qp || param->i_initial_qp < MIN_QP) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Error input parameter quant_0, check configuration file\n"); return -1; } @@ -1285,18 +1288,20 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en size_extra_frame_buffer = (param->enable_tdrdo + param->enable_sao + param->enable_alf) * xavs2_frame_buffer_size(param, FT_TEMP); /* compute the space size and alloc buffer */ + if (param->input_sample_bit_depth == 8) { mem_size = sizeof(xavs2_t) + /* xavs2_t */ sizeof(nal_t) * (MAX_SLICES + 6) + /* all nal units */ sizeof(uint8_t) * XAVS2_BS_HEAD_LEN + /* bitstream buffer (frame header only) */ sizeof(uint8_t) * bs_size + /* bitstream buffer for all slices */ sizeof(slice_t) * MAX_SLICES + /* slice array */ - sizeof(pel_t) * (frame_w * 2) * num_slices + /* buffer for intra_border */ + sizeof(pel8_t) * (frame_w * 2) * num_slices + /* buffer for intra_border */ sizeof(uint8_t) * w_in_scu * 32 * num_slices + /* buffer for edge filter flag (of one LCU row) */ sizeof(int8_t) * ipm_size * num_slices + /* intra prediction mode buffer */ sizeof(int8_t) * size_4x4 + /* inter prediction direction */ sizeof(int8_t) * size_4x4 * 2 + /* reference frames */ sizeof(mv_t) * size_4x4 * 2 + /* reference motion vectors */ CACHE_LINE_SIZE * (MAX_SLICES + 32); + mem_size += qpel_frame_size * 3 * sizeof(mct_t) + /* temporary buffer for 1/4 interpolation: a,1,b */ xavs2_me_get_buf_size(param) + /* buffers in me module */ @@ -1315,7 
+1320,7 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en /* alloc memory space */ mem_size = ((mem_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE) * CACHE_LINE_SIZE; - CHECKED_MALLOC(mem_base, uint8_t *, mem_size); + CHECKED_MALLOC8(mem_base, uint8_t *, mem_size); /* assign handle pointer of the xavs2 encoder */ h = (xavs2_t *)mem_base; @@ -1390,14 +1395,14 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en ALIGN_POINTER(mem_base); /* align pointer */ /* assign pointer to intra_border buffer */ - p_slice->slice_intra_border[0] = (pel_t *)mem_base; - mem_base += h->i_width * sizeof(pel_t); + p_slice->slice_intra_border8[0] = (pel8_t *)mem_base; + mem_base += h->i_width * sizeof(pel8_t); ALIGN_POINTER(mem_base); - p_slice->slice_intra_border[1] = (pel_t *)mem_base; - mem_base += (h->i_width / 2) * sizeof(pel_t); + p_slice->slice_intra_border8[1] = (pel8_t *)mem_base; + mem_base += (h->i_width / 2) * sizeof(pel8_t); ALIGN_POINTER(mem_base); - p_slice->slice_intra_border[2] = (pel_t *)mem_base; - mem_base += (h->i_width / 2) * sizeof(pel_t); + p_slice->slice_intra_border8[2] = (pel8_t *)mem_base; + mem_base += (h->i_width / 2) * sizeof(pel8_t); ALIGN_POINTER(mem_base); /* buffer for edge filter flag (of one LCU row) */ @@ -1408,7 +1413,7 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en ALIGN_POINTER(mem_base); } - slice_init_bufer(h, h->slices[0]); + slice_init_bufer8(h, h->slices[0]); /* ------------------------------------------------------------- * fenc fdec @@ -1421,14 +1426,14 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en */ /* assign pointers for p_fenc (Y/U/V pointers) */ - h->lcu.p_fenc[0] = h->lcu.fenc_buf; - h->lcu.p_fenc[1] = h->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE; - h->lcu.p_fenc[2] = h->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE + (FENC_STRIDE / 2); + h->lcu.p_fenc8[0] = h->lcu.fenc_buf8; + h->lcu.p_fenc8[1] = 
h->lcu.fenc_buf8 + FENC_STRIDE * MAX_CU_SIZE; + h->lcu.p_fenc8[2] = h->lcu.fenc_buf8 + FENC_STRIDE * MAX_CU_SIZE + (FENC_STRIDE / 2); /* assign pointers for p_fdec (Y/U/V pointers) */ - h->lcu.p_fdec[0] = h->lcu.fdec_buf; - h->lcu.p_fdec[1] = h->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE; - h->lcu.p_fdec[2] = h->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE + (FDEC_STRIDE / 2); + h->lcu.p_fdec8[0] = h->lcu.fdec_buf8; + h->lcu.p_fdec8[1] = h->lcu.fdec_buf8 + FDEC_STRIDE * MAX_CU_SIZE; + h->lcu.p_fdec8[2] = h->lcu.fdec_buf8 + FDEC_STRIDE * MAX_CU_SIZE + (FDEC_STRIDE / 2); /* slice index of CTUs */ h->lcu_slice_idx = (int8_t *)mem_base; @@ -1512,11 +1517,11 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en mem_base += sizeof(lcu_info_t) * w_in_lcu; if (xavs2_thread_mutex_init(&row->mutex, NULL)) { - goto fail; + goto fail8; } if (xavs2_thread_cond_init(&row->cond, NULL)) { - goto fail; + goto fail8; } } @@ -1573,7 +1578,7 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en if ((uintptr_t)(h) + mem_size < (uintptr_t)(mem_base)) { /* malloc size allocation error: no enough memory */ - goto fail; + goto fail8; } /* ------------------------------------------------------------- * init other properties/modules for xavs2 encoder @@ -1591,8 +1596,319 @@ xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_en return h; -fail: +fail8: + return NULL; + } else { + mem_size = sizeof(xavs2_t) + /* xavs2_t */ + sizeof(nal_t) * (MAX_SLICES + 6) + /* all nal units */ + sizeof(uint8_t) * XAVS2_BS_HEAD_LEN + /* bitstream buffer (frame header only) */ + sizeof(uint8_t) * bs_size + /* bitstream buffer for all slices */ + sizeof(slice_t) * MAX_SLICES + /* slice array */ + sizeof(pel10_t) * (frame_w * 2) * num_slices + /* buffer for intra_border */ + sizeof(uint8_t) * w_in_scu * 32 * num_slices + /* buffer for edge filter flag (of one LCU row) */ + sizeof(int8_t) * ipm_size * num_slices + /* intra prediction 
mode buffer */ + sizeof(int8_t) * size_4x4 + /* inter prediction direction */ + sizeof(int8_t) * size_4x4 * 2 + /* reference frames */ + sizeof(mv_t) * size_4x4 * 2 + /* reference motion vectors */ + CACHE_LINE_SIZE * (MAX_SLICES + 32); + + mem_size += + qpel_frame_size * 3 * sizeof(mct_t) + /* temporary buffer for 1/4 interpolation: a,1,b */ + xavs2_me_get_buf_size(param) + /* buffers in me module */ + info_size + /* the frame info structure */ + frame_size_in_scu * sizeof(cu_info_t) + /* CU data */ + num_me_bytes + /* Motion Estimation */ + w_in_lcu * h_in_lcu * sizeof(int8_t) + /* CTU slice index */ + size_extra_frame_buffer + /* extra frame buffer: TDRDO, SAO, ALF */ + + size_sao_stats + CACHE_LINE_SIZE + /* SAO stat data */ + size_sao_param + CACHE_LINE_SIZE + /* SAO parameters */ + size_sao_onoff + CACHE_LINE_SIZE + /* SAO on/off number of LCU row */ + + size_alf + CACHE_LINE_SIZE + /* ALF encoder contexts */ + CACHE_LINE_SIZE * 30; /* used for align buffer */ + + /* alloc memory space */ + mem_size = ((mem_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE) * CACHE_LINE_SIZE; + CHECKED_MALLOC10(mem_base, uint8_t *, mem_size); + + /* assign handle pointer of the xavs2 encoder */ + h = (xavs2_t *)mem_base; + memset(h, 0, sizeof(xavs2_t)); + mem_base += sizeof(xavs2_t); + ALIGN_POINTER(mem_base); /* align pointer */ + + /* init log module */ + h->module_log.i_log_level = param->i_log_level; + sprintf(h->module_log.module_name, "Enc[%2d] %06llx", idx_frm_encoder, (uintptr_t)(h)); + + /* copy the input parameters */ + h->param = param; + + /* const properties */ + h->i_width = frame_w; + h->i_height = frame_h; + h->i_width_in_lcu = w_in_lcu; + h->i_height_in_lcu = h_in_lcu; + h->i_width_in_mincu = w_in_scu; + h->i_height_in_mincu = h_in_scu; + h->i_width_in_minpu = w_in_4x4; + h->i_height_in_minpu = h_in_4x4; + + h->framerate = h->param->frame_rate; + + h->i_lcu_level = h->param->lcu_bit_level; + h->i_scu_level = h->param->scu_bit_level; + h->i_chroma_v_shift = 
h->param->chroma_format == CHROMA_420; + h->i_max_ref = h->param->num_max_ref; + h->b_progressive = (bool_t)h->param->progressive_frame; + h->b_field_sequence = (h->param->InterlaceCodingOption == FIELD_CODING); + + /* set table which indicates numbers of intra prediction modes for RDO */ + for (i = 0; i < MAX_CU_SIZE_IN_BIT; i++) { + h->tab_num_intra_rdo[i] = 1; /* this will later be set according to the preset level */ + } + h->num_rdo_intra_chroma = NUM_INTRA_MODE_CHROMA; + + /* ------------------------------------------------------------- + * assign buffer pointers of xavs2 encoder + */ + + /* point to all nal units */ + h->p_nal = (nal_t *)mem_base; + mem_base += sizeof(nal_t) * (MAX_SLICES + 6); + ALIGN_POINTER(mem_base); /* align pointer */ + + /* bitstream buffer (frame header) */ + h->p_bs_buf_header = mem_base; + h->i_bs_buf_header = sizeof(uint8_t) * XAVS2_BS_HEAD_LEN; + mem_base += sizeof(uint8_t) * XAVS2_BS_HEAD_LEN; + ALIGN_POINTER(mem_base); /* align pointer */ + + /* bitstream buffer for all slices */ + h->p_bs_buf_slice = mem_base; + h->i_bs_buf_slice = sizeof(uint8_t) * bs_size; + mem_base += sizeof(uint8_t) * bs_size; + ALIGN_POINTER(mem_base); /* align pointer */ + + /* slice array */ + for (i = 0; i < num_slices; i++) { + slice_t *p_slice = (slice_t *)mem_base; + h->slices[i] = p_slice; + mem_base += sizeof(slice_t); + ALIGN_POINTER(mem_base); /* align pointer */ + + /* intra prediction mode buffer */ + p_slice->slice_ipredmode = (int8_t *)mem_base; + mem_base += sizeof(int8_t) * ipm_size; + p_slice->slice_ipredmode += (h->i_width_in_minpu + 16) + 16; + ALIGN_POINTER(mem_base); /* align pointer */ + + /* assign pointer to intra_border buffer */ + p_slice->slice_intra_border10[0] = (pel10_t *)mem_base; + mem_base += h->i_width * sizeof(pel10_t); + ALIGN_POINTER(mem_base); + p_slice->slice_intra_border10[1] = (pel10_t *)mem_base; + mem_base += (h->i_width / 2) * sizeof(pel10_t); + ALIGN_POINTER(mem_base); + p_slice->slice_intra_border10[2] = 
(pel10_t *)mem_base; + mem_base += (h->i_width / 2) * sizeof(pel10_t); + ALIGN_POINTER(mem_base); + + /* buffer for edge filter flag (of one LCU row) */ + p_slice->slice_deblock_flag[0] = (uint8_t *)mem_base; + mem_base += h->i_width_in_mincu * (MAX_CU_SIZE / MIN_PU_SIZE) * sizeof(uint8_t); + p_slice->slice_deblock_flag[1] = (uint8_t *)mem_base; + mem_base += h->i_width_in_mincu * (MAX_CU_SIZE / MIN_PU_SIZE) * sizeof(uint8_t); + ALIGN_POINTER(mem_base); + } + + slice_init_bufer10(h, h->slices[0]); + + /* ------------------------------------------------------------- + * fenc fdec + * Y Y Y Y Y Y Y Y + * Y Y Y Y Y Y Y Y + * Y Y Y Y Y Y Y Y + * Y Y Y Y Y Y Y Y + * U U V V U U V V + * U U V V U U V V + */ + + /* assign pointers for p_fenc (Y/U/V pointers) */ + h->lcu.p_fenc10[0] = h->lcu.fenc_buf10; + h->lcu.p_fenc10[1] = h->lcu.fenc_buf10 + FENC_STRIDE * MAX_CU_SIZE; + h->lcu.p_fenc10[2] = h->lcu.fenc_buf10 + FENC_STRIDE * MAX_CU_SIZE + (FENC_STRIDE / 2); + + /* assign pointers for p_fdec (Y/U/V pointers) */ + h->lcu.p_fdec10[0] = h->lcu.fdec_buf10; + h->lcu.p_fdec10[1] = h->lcu.fdec_buf10 + FDEC_STRIDE * MAX_CU_SIZE; + h->lcu.p_fdec10[2] = h->lcu.fdec_buf10 + FDEC_STRIDE * MAX_CU_SIZE + (FDEC_STRIDE / 2); + + /* slice index of CTUs */ + h->lcu_slice_idx = (int8_t *)mem_base; + mem_base += w_in_lcu * h_in_lcu * sizeof(int8_t); + ALIGN_POINTER(mem_base); /* align pointer */ + + /* inter prediction mode */ + h->dir_pred = (int8_t *)mem_base; + mem_base += sizeof(int8_t) * size_4x4; + ALIGN_POINTER(mem_base); /* align pointer */ + + /* reference frames */ + h->fwd_1st_ref = (int8_t *)mem_base; + mem_base += sizeof(int8_t) * size_4x4; + ALIGN_POINTER(mem_base); /* align pointer */ + h->bwd_2nd_ref = (int8_t *)mem_base; + mem_base += sizeof(int8_t) * size_4x4; + ALIGN_POINTER(mem_base); /* align pointer */ + + /* reference motion vectors */ + h->fwd_1st_mv = (mv_t *)mem_base; + mem_base += sizeof(mv_t) * size_4x4; + ALIGN_POINTER(mem_base); /* align pointer */ + 
h->bwd_2nd_mv = (mv_t *)mem_base; + mem_base += sizeof(mv_t) * size_4x4; + ALIGN_POINTER(mem_base); /* align pointer */ + + /* temporary buffer for 1/4 interpolation: a,1,b, alone buffer */ + h->img4Y_tmp[0] = (mct_t *)mem_base; + h->img4Y_tmp[1] = h->img4Y_tmp[0] + qpel_frame_size; + h->img4Y_tmp[2] = h->img4Y_tmp[0] + qpel_frame_size * 2; + mem_base += qpel_frame_size * 3 * sizeof(mct_t); + ALIGN_POINTER(mem_base); + + /* SAO data */ + h->sao_stat_datas = (SAOStatData (*)[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES])mem_base; + memset(h->sao_stat_datas[0], 0, size_sao_stats); + mem_base += size_sao_stats; + ALIGN_POINTER(mem_base); + + h->sao_blk_params = (SAOBlkParam (*)[NUM_SAO_COMPONENTS])mem_base; + memset(h->sao_blk_params[0], 0, size_sao_param); + mem_base += size_sao_param; + ALIGN_POINTER(mem_base); + + h->num_sao_lcu_off = (int (*)[NUM_SAO_COMPONENTS])mem_base; + memset(h->num_sao_lcu_off[0], 0, size_sao_onoff); + mem_base += size_sao_onoff; + ALIGN_POINTER(mem_base); + + + /* init memory space in me module */ + xavs2_me_init(h, &mem_base); + + /* allocate frame_info_t (one for each frame context) */ + h->frameinfo = (frame_info_t *)mem_base; + mem_base += sizeof(frame_info_t); + ALIGN_POINTER(mem_base); /* align pointer */ + + h->frameinfo->rows = (row_info_t *)mem_base; + mem_base += sizeof(row_info_t) * h_in_lcu; + ALIGN_POINTER(mem_base); /* align pointer */ + + /* set available tables */ + set_available_tables(h); + + /* assign pointers for all coding tree units */ + h->lcu.p_ctu = &h->lcu.all_cu[0]; + h->lcu.i_scu_xy = 1; // borrowed + build_coding_tree(h, h->lcu.p_ctu, 0, h->i_lcu_level, 0, 0); + h->lcu.i_scu_xy = 0; // reset + + /* set row info */ + for (i = 0; i < h_in_lcu; i++) { + row_info_t *row = &h->frameinfo->rows[i]; + + row->h = 0; + row->row = i; + row->coded = -1; + row->lcus = (lcu_info_t *)mem_base; + mem_base += sizeof(lcu_info_t) * w_in_lcu; + + if (xavs2_thread_mutex_init(&row->mutex, NULL)) { + goto fail10; + } + + if 
(xavs2_thread_cond_init(&row->cond, NULL)) { + goto fail10; + } + } + + /* check memory size */ + ALIGN_POINTER(mem_base); /* align pointer */ + + /* ------------------------------------------------------------- + * allocate other alone spaces for xavs2 encoder + */ + + h->cu_info = (cu_info_t *)mem_base; + mem_base += frame_size_in_scu * sizeof(cu_info_t); + ALIGN_POINTER(mem_base); + + p_cu_info = h->cu_info; + for (j = 0; j < h_in_scu; j++) { + for (i = 0; i < w_in_scu; i++) { + scu_xy++; + p_cu_info->i_scu_x = i; + p_cu_info->i_scu_y = j; + p_cu_info++; + } + } + + /* motion estimation buffer */ + h->all_mincost = (dist_t(*)[MAX_INTER_MODES][MAX_REFS])mem_base; + mem_base += num_me_bytes; + ALIGN_POINTER(mem_base); + + // allocate memory for current frame + if (h->param->enable_tdrdo) { + h->img_luma_pre = xavs2_frame_new(h, &mem_base, FT_TEMP); + ALIGN_POINTER(mem_base); + } else { + h->img_luma_pre = NULL; + } + + if (h->param->enable_sao) { + h->img_sao = xavs2_frame_new(h, &mem_base, FT_TEMP); + ALIGN_POINTER(mem_base); + } else { + h->img_sao = NULL; + } + + if (h->param->enable_alf) { + h->img_alf = xavs2_frame_new(h, &mem_base, FT_TEMP); + ALIGN_POINTER(mem_base); + alf_init_buffer(h, mem_base); + mem_base += size_alf; + ALIGN_POINTER(mem_base); + } else { + h->img_alf = NULL; + } + + if ((uintptr_t)(h) + mem_size < (uintptr_t)(mem_base)) { + /* malloc size allocation error: no enough memory */ + goto fail10; + } + /* ------------------------------------------------------------- + * init other properties/modules for xavs2 encoder + */ + + /* init all slices */ + xavs2_slices_init(h); + +#if ENABLE_WQUANT + /* adaptive frequency weighting quantization */ + if (h->param->enable_wquant) { + xavs2_wq_init_seq_quant_param(h); + } +#endif + + return h; + +fail10: return NULL; + } } /* --------------------------------------------------------------------------- @@ -1632,8 +1948,9 @@ int encoder_contexts_init(xavs2_t *h, xavs2_handler_t *h_mgr) /* 
------------------------------------------------------------- * build lcu row encoding contexts */ + if (h->param->input_sample_bit_depth == 8) { if (h_mgr->num_row_contexts > 1) { - CHECKED_MALLOC(h_mgr->row_contexts, xavs2_t *, h_mgr->num_row_contexts * sizeof(xavs2_t)); + CHECKED_MALLOC8(h_mgr->row_contexts, xavs2_t *, h_mgr->num_row_contexts * sizeof(xavs2_t)); for (i = 0; i < h_mgr->num_row_contexts; i++) { xavs2_t *h_row_coder = &h_mgr->row_contexts[i]; @@ -1654,14 +1971,14 @@ int encoder_contexts_init(xavs2_t *h, xavs2_handler_t *h_mgr) h_row_coder->lcu.i_scu_xy = 0; // reset /* assign pointers for p_fenc (Y/U/V pointers) */ - h_row_coder->lcu.p_fenc[0] = h_row_coder->lcu.fenc_buf; - h_row_coder->lcu.p_fenc[1] = h_row_coder->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE; - h_row_coder->lcu.p_fenc[2] = h_row_coder->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE + FENC_STRIDE / 2; + h_row_coder->lcu.p_fenc8[0] = h_row_coder->lcu.fenc_buf8; + h_row_coder->lcu.p_fenc8[1] = h_row_coder->lcu.fenc_buf8 + FENC_STRIDE * MAX_CU_SIZE; + h_row_coder->lcu.p_fenc8[2] = h_row_coder->lcu.fenc_buf8 + FENC_STRIDE * MAX_CU_SIZE + FENC_STRIDE / 2; /* assign pointers for p_fdec (Y/U/V pointers) */ - h_row_coder->lcu.p_fdec[0] = h_row_coder->lcu.fdec_buf; - h_row_coder->lcu.p_fdec[1] = h_row_coder->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE; - h_row_coder->lcu.p_fdec[2] = h_row_coder->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE + FDEC_STRIDE / 2; + h_row_coder->lcu.p_fdec8[0] = h_row_coder->lcu.fdec_buf8; + h_row_coder->lcu.p_fdec8[1] = h_row_coder->lcu.fdec_buf8 + FDEC_STRIDE * MAX_CU_SIZE; + h_row_coder->lcu.p_fdec8[2] = h_row_coder->lcu.fdec_buf8 + FDEC_STRIDE * MAX_CU_SIZE + FDEC_STRIDE / 2; } } @@ -1670,7 +1987,7 @@ int encoder_contexts_init(xavs2_t *h, xavs2_handler_t *h_mgr) h_mgr->frm_contexts[0] = h; /* context 0 is the main encoder handle */ for (i = 1; i < h_mgr->i_frm_threads; i++) { if ((h_mgr->frm_contexts[i] = encoder_create_frame_context(h->param, i)) == 0) { - goto fail; + goto 
fail8; } memcpy(&h_mgr->frm_contexts[i]->communal_vars_1, &h->communal_vars_1, @@ -1679,8 +1996,59 @@ int encoder_contexts_init(xavs2_t *h, xavs2_handler_t *h_mgr) return 0; -fail: +fail8: + return -1; + } else { + if (h_mgr->num_row_contexts > 1) { + CHECKED_MALLOC10(h_mgr->row_contexts, xavs2_t *, h_mgr->num_row_contexts * sizeof(xavs2_t)); + + for (i = 0; i < h_mgr->num_row_contexts; i++) { + xavs2_t *h_row_coder = &h_mgr->row_contexts[i]; + + memcpy(&h_row_coder->communal_vars_1, &h->communal_vars_1, + (uint8_t *)&h->communal_vars_2 - (uint8_t *)&h->communal_vars_1); + + /* identify ourself */ + h_row_coder->task_type = XAVS2_TASK_ROW; + + /* we are free */ + h_row_coder->i_aec_frm = -1; + + /* assign pointers for all coding tree units */ + h_row_coder->lcu.p_ctu = &h_row_coder->lcu.all_cu[0]; + h_row_coder->lcu.i_scu_xy = 1; // borrowed + build_coding_tree(h_row_coder, h_row_coder->lcu.p_ctu, 0, h_row_coder->i_lcu_level, 0, 0); + h_row_coder->lcu.i_scu_xy = 0; // reset + + /* assign pointers for p_fenc (Y/U/V pointers) */ + h_row_coder->lcu.p_fenc10[0] = h_row_coder->lcu.fenc_buf10; + h_row_coder->lcu.p_fenc10[1] = h_row_coder->lcu.fenc_buf10 + FENC_STRIDE * MAX_CU_SIZE; + h_row_coder->lcu.p_fenc10[2] = h_row_coder->lcu.fenc_buf10 + FENC_STRIDE * MAX_CU_SIZE + FENC_STRIDE / 2; + + /* assign pointers for p_fdec (Y/U/V pointers) */ + h_row_coder->lcu.p_fdec10[0] = h_row_coder->lcu.fdec_buf10; + h_row_coder->lcu.p_fdec10[1] = h_row_coder->lcu.fdec_buf10 + FDEC_STRIDE * MAX_CU_SIZE; + h_row_coder->lcu.p_fdec10[2] = h_row_coder->lcu.fdec_buf10 + FDEC_STRIDE * MAX_CU_SIZE + FDEC_STRIDE / 2; + } + } + + /* ------------------------------------------------------------- + * build frame encoding contexts */ + h_mgr->frm_contexts[0] = h; /* context 0 is the main encoder handle */ + for (i = 1; i < h_mgr->i_frm_threads; i++) { + if ((h_mgr->frm_contexts[i] = encoder_create_frame_context(h->param, i)) == 0) { + goto fail10; + } + + 
memcpy(&h_mgr->frm_contexts[i]->communal_vars_1, &h->communal_vars_1, + (uint8_t *)&h->communal_vars_2 - (uint8_t *)&h->communal_vars_1); + } + + return 0; + +fail10: return -1; + } } /* --------------------------------------------------------------------------- @@ -1837,12 +2205,22 @@ static void init_decoding_frame(xavs2_t *h) static void encoder_init_func_handles(xavs2_t *h) { /* set some function handles according option or preset level */ + if (h->param->input_sample_bit_depth == 8) { if (h->param->enable_hadamard) { - g_funcs.pixf.intra_cmp = g_funcs.pixf.satd; - g_funcs.pixf.fpel_cmp = g_funcs.pixf.satd; + g_funcs.pixf.intra8_cmp = g_funcs.pixf.satd8; + g_funcs.pixf.fpel8_cmp = g_funcs.pixf.satd8; } else { - g_funcs.pixf.intra_cmp = g_funcs.pixf.sad; - g_funcs.pixf.fpel_cmp = g_funcs.pixf.sad; + g_funcs.pixf.intra8_cmp = g_funcs.pixf.sad8; + g_funcs.pixf.fpel8_cmp = g_funcs.pixf.sad8; + } + } else { + if (h->param->enable_hadamard) { + g_funcs.pixf.intra10_cmp = g_funcs.pixf.satd10; + g_funcs.pixf.fpel10_cmp = g_funcs.pixf.satd10; + } else { + g_funcs.pixf.intra10_cmp = g_funcs.pixf.sad10; + g_funcs.pixf.fpel10_cmp = g_funcs.pixf.sad10; + } } } @@ -1992,7 +2370,7 @@ void xavs2e_frame_coding_init(xavs2_t *h) /* encoding begin ---------------------------------------------- */ - /* 帧级其他参数初始化 */ + /* 甯х骇鍏朵粬鍙傛暟鍒濆鍖 */ if (IS_ALG_ENABLE(OPT_CU_QSFD)) { qsfd_calculate_threshold_of_a_frame(h); } @@ -2038,7 +2416,7 @@ void *xavs2e_encode_one_frame(void *arg) /* start AEC frame coding */ if (h->h_top->threadpool_aec != NULL && !h->param->enable_alf) { - xavs2_threadpool_run(h->h_top->threadpool_aec, encoder_aec_encode_one_frame, h, 0); + xavs2_threadpool_run(h->h_top->threadpool_aec, (void * (*)(void *)) encoder_aec_encode_one_frame, h, 0); } /* (3) encode all LCU rows in current frame --------------------------- @@ -2051,44 +2429,44 @@ void *xavs2e_encode_one_frame(void *arg) h->i_slice_index = g_slice_lcu_row_order[i].slice_idx; - /* 是否需要额外处理Slice边界 */ + /* 
鏄惁闇瑕侀澶栧鐞哠lice杈圭晫 */ row->b_top_slice_border = 0; row->b_down_slice_border = 0; - /* 当前帧内的依赖行 */ + /* 褰撳墠甯у唴鐨勪緷璧栬 */ if (row_type) { last_row = &rows[lcu_y - 1]; row->b_down_slice_border = (row_type == 2 && lcu_y != h->i_height_in_lcu - 1); } else { - xavs2_slice_write_start(h); /* Slice的第一行,初始化 */ + xavs2_slice_write_start(h); /* Slice鐨勭涓琛岋紝鍒濆鍖 */ last_row = NULL; row->b_top_slice_border = (lcu_y > 0); } - /* 等待参考帧中依赖的行编码完毕 */ + /* 绛夊緟鍙傝冨抚涓緷璧栫殑琛岀紪鐮佸畬姣 */ xavs2e_inter_sync(h, lcu_y, 0); /* encode one lcu row */ if (enable_wpp && i != h->i_height_in_lcu - 1) { - /* 1, 分配一个行级的线程进行编码 */ + /* 1, 鍒嗛厤涓涓绾х殑绾跨▼杩涜缂栫爜 */ if ((row->h = xavs2e_alloc_row_task(h)) == NULL) { return NULL; } - /* 2, 检查当前行是否应立刻启动; - * 规则为等待上一行至少完成两个LCU才启动线程,这里至少等待1个 + /* 2, 妫鏌ュ綋鍓嶈鏄惁搴旂珛鍒诲惎鍔紱 + * 瑙勫垯涓虹瓑寰呬笂涓琛岃嚦灏戝畬鎴愪袱涓狶CU鎵嶅惎鍔ㄧ嚎绋嬶紝杩欓噷鑷冲皯绛夊緟1涓 */ wait_lcu_row_coded(last_row, 0); - /* 3, 使用该行级线程进行编码 */ + /* 3, 浣跨敤璇ヨ绾х嚎绋嬭繘琛岀紪鐮 */ xavs2_threadpool_run(h->h_top->threadpool_rdo, xavs2_lcu_row_write, row, 0); } else { row->h = h; xavs2_lcu_row_write(row); } - /* 对Slice的最后一行LCU来说,需要合并多个Slice的码流 - * 但在RDO阶段,并不需要 */ + /* 瀵筍lice鐨勬渶鍚庝竴琛孡CU鏉ヨ锛岄渶瑕佸悎骞跺涓猄lice鐨勭爜娴 + * 浣嗗湪RDO闃舵锛屽苟涓嶉渶瑕 */ // if (h->param->slice_num > 1 && row_type == 2) { // nal_merge_slice(h, h->slices[h->i_slice_index]->p_bs_buf, h->i_nal_type, h->i_nal_ref_idc); // } @@ -2107,7 +2485,7 @@ void *xavs2e_encode_one_frame(void *arg) } } - /* (5) 统计SAO的开启和开关比率 */ + /* (5) 缁熻SAO鐨勫紑鍚拰寮鍏虫瘮鐜 */ if (h->param->enable_sao && (h->slice_sao_on[0] || h->slice_sao_on[1] || h->slice_sao_on[2])) { int sao_off_num_y = 0; int sao_off_num_u = 0; @@ -2132,7 +2510,7 @@ void *xavs2e_encode_one_frame(void *arg) xavs2_frame_copy_planes(h, h->img_alf, h->fdec); xavs2_frame_expand_border_frame(h, h->img_alf); alf_filter_one_frame(h); - /* 重新对重构图像边界进行扩展 */ + /* 閲嶆柊瀵归噸鏋勫浘鍍忚竟鐣岃繘琛屾墿灞 */ if (h->pic_alf_on[0] || h->pic_alf_on[1] || h->pic_alf_on[2]) { xavs2_frame_expand_border_frame(h, h->fdec); } @@ -2147,7 +2525,7 @@ void *xavs2e_encode_one_frame(void *arg) #endif 
if (h->h_top->threadpool_aec != NULL) { - xavs2_threadpool_run(h->h_top->threadpool_aec, encoder_aec_encode_one_frame, h, 0); + xavs2_threadpool_run(h->h_top->threadpool_aec, (void * (*)(void *)) encoder_aec_encode_one_frame, h, 0); } } diff --git a/source/encoder/encoder_report.c b/source/encoder/encoder_report.c index 9873c15..6849e66 100644 --- a/source/encoder/encoder_report.c +++ b/source/encoder/encoder_report.c @@ -72,21 +72,22 @@ void encoder_cal_psnr(xavs2_t *h, double *psnr_y, double *psnr_u, double *psnr_v const int inout_shift = 0; uint64_t diff_y, diff_u, diff_v; + if (h->param->input_sample_bit_depth == 8) { /* luma */ - diff_y = xavs2_pixel_ssd_wxh(&g_funcs.pixf, - h->fenc->planes[0], h->fenc->i_stride[0], - h->fdec->planes[0], h->fdec->i_stride[0], i_width, i_height, inout_shift); + diff_y = xavs2_pixel_ssd8_wxh(&g_funcs.pixf, + h->fenc->planes8[0], h->fenc->i_stride[0], + h->fdec->planes8[0], h->fdec->i_stride[0], i_width, i_height, inout_shift); /* chroma */ if (h->param->chroma_format != CHROMA_400) { i_width >>= 1; i_height >>= 1; - diff_u = xavs2_pixel_ssd_wxh(&g_funcs.pixf, - h->fenc->planes[1], h->fenc->i_stride[1], - h->fdec->planes[1], h->fdec->i_stride[1], i_width, i_height, inout_shift); - diff_v = xavs2_pixel_ssd_wxh(&g_funcs.pixf, - h->fenc->planes[2], h->fenc->i_stride[2], - h->fdec->planes[2], h->fdec->i_stride[2], i_width, i_height, inout_shift); + diff_u = xavs2_pixel_ssd8_wxh(&g_funcs.pixf, + h->fenc->planes8[1], h->fenc->i_stride[1], + h->fdec->planes8[1], h->fdec->i_stride[1], i_width, i_height, inout_shift); + diff_v = xavs2_pixel_ssd8_wxh(&g_funcs.pixf, + h->fenc->planes8[2], h->fenc->i_stride[2], + h->fdec->planes8[2], h->fdec->i_stride[2], i_width, i_height, inout_shift); } else { diff_u = 0; diff_v = 0; @@ -98,6 +99,34 @@ void encoder_cal_psnr(xavs2_t *h, double *psnr_y, double *psnr_u, double *psnr_v *psnr_y = get_psnr_with_ssd(f_max_signal, diff_y); *psnr_u = get_psnr_with_ssd(f_max_signal, diff_u * uvformat); *psnr_v = 
get_psnr_with_ssd(f_max_signal, diff_v * uvformat); + } else { + /* luma */ + diff_y = xavs2_pixel_ssd10_wxh(&g_funcs.pixf, + h->fenc->planes10[0], h->fenc->i_stride[0], + h->fdec->planes10[0], h->fdec->i_stride[0], i_width, i_height, inout_shift); + + /* chroma */ + if (h->param->chroma_format != CHROMA_400) { + i_width >>= 1; + i_height >>= 1; + diff_u = xavs2_pixel_ssd10_wxh(&g_funcs.pixf, + h->fenc->planes10[1], h->fenc->i_stride[1], + h->fdec->planes10[1], h->fdec->i_stride[1], i_width, i_height, inout_shift); + diff_v = xavs2_pixel_ssd10_wxh(&g_funcs.pixf, + h->fenc->planes10[2], h->fenc->i_stride[2], + h->fdec->planes10[2], h->fdec->i_stride[2], i_width, i_height, inout_shift); + } else { + diff_u = 0; + diff_v = 0; + } + + xavs2_emms(); /* call before using float instructions */ + + /* get the PSNR for current frame */ + *psnr_y = get_psnr_with_ssd(f_max_signal, diff_y); + *psnr_u = get_psnr_with_ssd(f_max_signal, diff_u * uvformat); + *psnr_v = get_psnr_with_ssd(f_max_signal, diff_v * uvformat); + } } /* --------------------------------------------------------------------------- @@ -136,12 +165,13 @@ double ssim_calculate_plane(xavs2_t *h, int comp_id) double C1 = k_ssim_1 * k_ssim_1 * uiMaxval * uiMaxval; double C2 = k_ssim_2 * k_ssim_2 * uiMaxval * uiMaxval; - pel_t* pOrg = h->fenc->planes[comp_id]; - pel_t* pRec = h->fdec->planes[comp_id]; + if (h->param->input_sample_bit_depth == 8) { + pel8_t* pOrg = h->fenc->planes8[comp_id]; + pel8_t* pRec = h->fdec->planes8[comp_id]; // xavs2_log(h, XAVS2_LOG_INFO, "pOrg : %p pRec : %p\n",pOrg,pRec); - pel_t* pOrgPel = pOrg; - pel_t* pRecPel = pRec; + pel8_t* pOrgPel = pOrg; + pel8_t* pRecPel = pRec; for (j = 0; j <= uiHeight - uiWinHeight; j++) { for (i = 0; i <= uiWidth - uiWinWidth; i++) { @@ -191,6 +221,63 @@ double ssim_calculate_plane(xavs2_t *h, int comp_id) // xavs2_log(h, XAVS2_LOG_INFO,"ssim: %7.4f \n ", dMSSIM / (double)uiNumWin); return dMSSIM / (double)uiNumWin; + } else { + pel10_t* pOrg = 
h->fenc->planes10[comp_id]; + pel10_t* pRec = h->fdec->planes10[comp_id]; + // xavs2_log(h, XAVS2_LOG_INFO, "pOrg : %p pRec : %p\n",pOrg,pRec); + + pel10_t* pOrgPel = pOrg; + pel10_t* pRecPel = pRec; + + for (j = 0; j <= uiHeight - uiWinHeight; j++) { + for (i = 0; i <= uiWidth - uiWinWidth; i++) { + dLocMeanRef = 0; + dLocMeanRec = 0; + dLocVarRef = 0; + dLocVarRec = 0; + dLocCovar = 0; + pOrgPel = pOrg + i + iStride1*j; + pRecPel = pRec + i + iStride2*j; + // xavs2_log(h, XAVS2_LOG_INFO, "pOrgPel[0] : %d pRecPel[0] : %d\n",pOrgPel[0],pRecPel[0]); + // xavs2_log(h, XAVS2_LOG_INFO, "uiWinWidth : %d uiWinHeight : %d\n",uiWinWidth,uiWinHeight); + + for (y = 0; y < uiWinHeight; y++) { + for (x = 0; x < uiWinWidth; x++) { + // xavs2_log(h, XAVS2_LOG_INFO, "pOrgPel[%d] : %d pRecPel[%d] : %d\n",x,pOrgPel[x],x,pRecPel[x]); + + dLocMeanRef += pOrgPel[x]; + dLocMeanRec += pRecPel[x]; + dLocVarRef += pOrgPel[x] * pOrgPel[x]; + dLocVarRec += pRecPel[x] * pRecPel[x]; + dLocCovar += pOrgPel[x] * pRecPel[x]; + + } + pOrgPel += iStride1; + pRecPel += iStride2; + } + + dLocMeanRef /= iWinPixel; + dLocMeanRec /= iWinPixel; + // xavs2_log(h, XAVS2_LOG_INFO, "dLocMeanRef : %7.4f dLocMeanRec : %7.4f \n",dLocMeanRef,dLocMeanRec); + + dLocVarRef = (dLocVarRef - dLocMeanRef * dLocMeanRef * iWinPixel) / iWinPixel; + dLocVarRec = (dLocVarRec - dLocMeanRec * dLocMeanRec * iWinPixel) / iWinPixel; + dLocCovar = (dLocCovar - dLocMeanRef * dLocMeanRec * iWinPixel) / iWinPixel; + + Num1 = 2.0 * dLocMeanRef * dLocMeanRec + C1; + Num2 = 2.0 * dLocCovar + C2; + Den1 = dLocMeanRef * dLocMeanRef + dLocMeanRec * dLocMeanRec + C1; + Den2 = dLocVarRef + dLocVarRec + C2; + + dLocSSIM = (Num1 * Num2) / (Den1 * Den2); + + dMSSIM += dLocSSIM; + } + } + + // xavs2_log(h, XAVS2_LOG_INFO,"ssim: %7.4f \n ", dMSSIM / (double)uiNumWin); + return dMSSIM / (double)uiNumWin; + } } /* --------------------------------------------------------------------------- @@ -405,8 +492,13 @@ void 
encoder_show_head_info(xavs2_param_t *param) xavs2_log(NULL, XAVS2_LOG_DEBUG, " Total Frames : %d \n", param->num_frames); /* basic parameters */ xavs2_log(NULL, XAVS2_LOG_INFO, "--------------------------------------------------------------------------------\n"); + if (param->input_sample_bit_depth == 8) { + xavs2_log(NULL, XAVS2_LOG_INFO, " Profile & Level : 0x%02X-0x%02X, BitDepth: %d/%d, size(pel): %d \n", + param->profile_id, param->level_id, param->input_sample_bit_depth, param->sample_bit_depth, sizeof(pel8_t)); + } else { xavs2_log(NULL, XAVS2_LOG_INFO, " Profile & Level : 0x%02X-0x%02X, BitDepth: %d/%d, size(pel): %d \n", - param->profile_id, param->level_id, param->input_sample_bit_depth, param->sample_bit_depth, sizeof(pel_t)); + param->profile_id, param->level_id, param->input_sample_bit_depth, param->sample_bit_depth, sizeof(pel10_t)); + } xavs2_log(NULL, XAVS2_LOG_INFO, " Video Property : %dx%d, %.3f Hz (FrameRateCode: %d)\n", param->org_width, param->org_height, param->frame_rate, param->frame_rate_code); diff --git a/source/encoder/header.c b/source/encoder/header.c index 0802c4a..7a84ff7 100644 --- a/source/encoder/header.c +++ b/source/encoder/header.c @@ -52,7 +52,7 @@ */ static ALWAYS_INLINE int is_valid_qp(xavs2_t *h, int i_qp) { - int max_qp = MAX_QP; + int max_qp = MAX_QP + (h->param->sample_bit_depth - 8) * 8; UNUSED_PARAMETER(h); return i_qp >= 0 && i_qp <= max_qp; } diff --git a/source/encoder/md_inter.c b/source/encoder/md_inter.c index 8b652f2..7613150 100644 --- a/source/encoder/md_inter.c +++ b/source/encoder/md_inter.c @@ -152,7 +152,7 @@ void get_bskip_mv_spatial(cu_mode_t *p_cumode, const neighbor_inter_t *p_neighbo } } - /* 相邻块不存在双向预测块时,双向Skip/Direct模式的填充 */ + /* 鐩搁偦鍧椾笉瀛樺湪鍙屽悜棰勬祴鍧楁椂锛屽弻鍚慡kip/Direct妯″紡鐨勫~鍏 */ if (bid_flag == 0 && fwd_flag != 0 && bw_flag != 0) { p_cumode->skip_mv_2nd[DS_B_BID] = p_cumode->skip_mv_2nd[DS_B_BWD]; p_cumode->skip_mv_1st[DS_B_BID] = p_cumode->skip_mv_1st[DS_B_FWD]; @@ -160,16 +160,16 @@ void 
get_bskip_mv_spatial(cu_mode_t *p_cumode, const neighbor_inter_t *p_neighbo p_cumode->skip_ref_1st[DS_B_BID] = B_FWD; p_cumode->skip_ref_2nd[DS_B_BID] = B_BWD; - /* 相邻块不存在对称预测块时,对称Skip/Direct模式的填充 */ + /* 鐩搁偦鍧椾笉瀛樺湪瀵圭О棰勬祴鍧楁椂锛屽绉癝kip/Direct妯″紡鐨勫~鍏 */ if (sym_flag == 0) { - if (bid_flag > 1) { /* 若存在双向预测块,则使用双向预测块生成 */ + if (bid_flag > 1) { /* 鑻ュ瓨鍦ㄥ弻鍚戦娴嬪潡锛屽垯浣跨敤鍙屽悜棰勬祴鍧楃敓鎴 */ p_cumode->skip_mv_2nd[DS_B_SYM] = p_neighbors[bid2].mv[1]; p_cumode->skip_mv_1st[DS_B_SYM] = p_neighbors[bid2].mv[0]; - } else if (bw_flag != 0) { /* 若存在后向预测块,则使用后向预测块生成 */ + } else if (bw_flag != 0) { /* 鑻ュ瓨鍦ㄥ悗鍚戦娴嬪潡锛屽垯浣跨敤鍚庡悜棰勬祴鍧楃敓鎴 */ p_cumode->skip_mv_2nd[DS_B_SYM] = p_cumode->skip_mv_2nd[DS_B_BWD]; p_cumode->skip_mv_1st[DS_B_SYM].x = -p_cumode->skip_mv_2nd[DS_B_BWD].x; p_cumode->skip_mv_1st[DS_B_SYM].y = -p_cumode->skip_mv_2nd[DS_B_BWD].y; - } else if (fwd_flag != 0) { /* 若存在前向预测块,则使用前向预测块生成 */ + } else if (fwd_flag != 0) { /* 鑻ュ瓨鍦ㄥ墠鍚戦娴嬪潡锛屽垯浣跨敤鍓嶅悜棰勬祴鍧楃敓鎴 */ p_cumode->skip_mv_2nd[DS_B_SYM].x = -p_cumode->skip_mv_1st[DS_B_FWD].x; p_cumode->skip_mv_2nd[DS_B_SYM].y = -p_cumode->skip_mv_1st[DS_B_FWD].y; p_cumode->skip_mv_1st[DS_B_SYM] = p_cumode->skip_mv_1st[DS_B_FWD]; @@ -177,16 +177,16 @@ void get_bskip_mv_spatial(cu_mode_t *p_cumode, const neighbor_inter_t *p_neighbo } p_cumode->skip_ref_1st[DS_B_SYM] = B_FWD; p_cumode->skip_ref_2nd[DS_B_SYM] = B_BWD; - /* 后向预测块不存在时后向Skip/Direct模式的填充 */ - if (bw_flag == 0 && bid_flag > 1) { /* 如果存在双向预测块,则使用双向预测块逆序的最后一个元素 */ + /* 鍚庡悜棰勬祴鍧椾笉瀛樺湪鏃跺悗鍚慡kip/Direct妯″紡鐨勫~鍏 */ + if (bw_flag == 0 && bid_flag > 1) { /* 濡傛灉瀛樺湪鍙屽悜棰勬祴鍧楋紝鍒欎娇鐢ㄥ弻鍚戦娴嬪潡閫嗗簭鐨勬渶鍚庝竴涓厓绱 */ p_cumode->skip_mv_2nd[DS_B_BWD] = p_neighbors[bid2].mv[1]; - } else if (bw_flag == 0 && bid_flag != 0) { /* 只有一个双向预测块时,使用双向列表的后向 */ + } else if (bw_flag == 0 && bid_flag != 0) { /* 鍙湁涓涓弻鍚戦娴嬪潡鏃讹紝浣跨敤鍙屽悜鍒楄〃鐨勫悗鍚 */ p_cumode->skip_mv_2nd[DS_B_BWD] = p_cumode->skip_mv_2nd[DS_B_BID]; } p_cumode->skip_ref_1st[DS_B_BWD] = INVALID_REF; p_cumode->skip_ref_2nd[DS_B_BWD] = B_BWD; - /* 
前向预测块不存在时前向Skip/Direct模式的填充,类似后向Skip/Direct模式 */ + /* 鍓嶅悜棰勬祴鍧椾笉瀛樺湪鏃跺墠鍚慡kip/Direct妯″紡鐨勫~鍏咃紝绫讳技鍚庡悜Skip/Direct妯″紡 */ if (fwd_flag == 0 && bid_flag > 1) { p_cumode->skip_mv_1st[DS_B_FWD] = p_neighbors[bid2].mv[0]; } else if (fwd_flag == 0 && bid_flag != 0) { @@ -757,8 +757,8 @@ int get_mv_predictors_bskip(xavs2_t *h, cu_t *p_cu) col_mv_pos = (pic_block_y >> 4) * w_in_16x16 + (pic_block_x >> 4); col_blk_ref = col_ref[col_mv_pos]; if (col_blk_ref == INVALID_REF) { - ///! 9.5.8.4.3 运动矢量导出方法2:如果编码 单元子类型为 B_Skip_Bi,且时域PU的参考索引为 INVALID_REF - get_mvp_default(h, p_neighbors, &mv_1st, 0, &cur_cb, B_FWD); // 这里传递的ref_idx影响p_me->pred_sad_space,但不被使用 + ///! 9.5.8.4.3 杩愬姩鐭㈤噺瀵煎嚭鏂规硶2锛氬鏋滅紪鐮 鍗曞厓瀛愮被鍨嬩负 B_Skip_Bi锛屼笖鏃跺煙PU鐨勫弬鑰冪储寮曚负 INVALID_REF + get_mvp_default(h, p_neighbors, &mv_1st, 0, &cur_cb, B_FWD); // 杩欓噷浼犻掔殑ref_idx褰卞搷p_me->pred_sad_space锛屼絾涓嶈浣跨敤 get_mvp_default(h, p_neighbors, &mv_2nd, 1, &cur_cb, B_BWD); } else { int TRp = h->fref[B_BWD]->ref_dpoc[col_blk_ref]; @@ -879,7 +879,7 @@ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_m int mv_mempos_x; int mv_mempos_y; mv_t mv; - int b_mv_valid; // MV是否有效:大小取值是否在标准规定的有效范围内 + int b_mv_valid; // MV鏄惁鏈夋晥锛氬ぇ灏忓彇鍊兼槸鍚﹀湪鏍囧噯瑙勫畾鐨勬湁鏁堣寖鍥村唴 int pu_idx_x = p_cb->x != 0; // PU index in CU int pu_idx_y = p_cb->y != 0; int pu_idx = (pu_idx_y << 1) + pu_idx_x; @@ -895,12 +895,16 @@ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_m int max_ref = h->i_ref; *fwd_cost = MAX_DISTORTION; - mv_mempos_x = (pix_x + MIN_PU_SIZE - 1) >> MIN_PU_SIZE_IN_BIT; // 考虑到8x8块的非对称划分,需要做一个补偿再移位 + mv_mempos_x = (pix_x + MIN_PU_SIZE - 1) >> MIN_PU_SIZE_IN_BIT; // 鑰冭檻鍒8x8鍧楃殑闈炲绉板垝鍒嗭紝闇瑕佸仛涓涓ˉ鍋垮啀绉讳綅 mv_mempos_y = (pix_y + MIN_PU_SIZE - 1) >> MIN_PU_SIZE_IN_BIT; all_min_costs = &h->all_mincost[mv_mempos_y * width_in_4x4 + mv_mempos_x]; /* make p_fenc point to the start address of the current PU */ - p_me->p_fenc = h->lcu.p_fenc[0] + (pix_y - h->lcu.i_pix_y) * FENC_STRIDE + pix_x - h->lcu.i_pix_x; + if 
(h->param->input_sample_bit_depth == 8) { + p_me->p_fenc8 = h->lcu.p_fenc8[0] + (pix_y - h->lcu.i_pix_y) * FENC_STRIDE + pix_x - h->lcu.i_pix_x; + } else { + p_me->p_fenc10 = h->lcu.p_fenc10[0] + (pix_y - h->lcu.i_pix_y) * FENC_STRIDE + pix_x - h->lcu.i_pix_x; + } p_me->i_pixel = PART_INDEX(bsx, bsy); p_me->i_pix_x = pix_x; p_me->i_pix_y = pix_y; @@ -938,7 +942,7 @@ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_m get_mvp_default(h, p_neighbors, pred_mv, bwd_2nd, p_cb, ref_idx); } - // 需在 MVP 获取之后执行,两者都会设置 p_me 状态 + // 闇鍦 MVP 鑾峰彇涔嬪悗鎵ц锛屼袱鑰呴兘浼氳缃 p_me 鐘舵 p_me->i_ref_idx = (int16_t)ref_idx; if (h->param->me_method == XAVS2_ME_UMH) { fast_me_prepare_info(h, p_me, mode, ref_idx, pu_idx, all_min_costs[0]); @@ -949,11 +953,11 @@ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_m p_me->p_fref_1st = p_ref_frm; p_me->mvp.v = pred_mv->v; - /* 限制MVP的取值,如果MVP值过大,则不做ME */ + /* 闄愬埗MVP鐨勫彇鍊硷紝濡傛灉MVP鍊艰繃澶э紝鍒欎笉鍋歁E */ b_mv_valid = check_mv_range(h, pred_mv, ref_idx, pix_x, pix_y, bsx, bsy); b_mv_valid &= check_mvd(h, pred_mv->x, pred_mv->y); - /* 默认必须搜索的点位置 */ + /* 榛樿蹇呴』鎼滅储鐨勭偣浣嶇疆 */ i_mvc = 0; i_mvc = add_one_mv_candidate(p_me, mvc, i_mvc, p_me->mvp.x, p_me->mvp.y); i_mvc = add_one_mv_candidate(p_me, mvc, i_mvc, 0, 0); @@ -961,7 +965,7 @@ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_m if (b_mv_valid) { cost = xavs2_me_search(h, p_me, mvc, i_mvc); } else { - p_me->bmv = p_me->mvp; // MVP越界时,最优MV设置成和MVP一样大小 + p_me->bmv = p_me->mvp; // MVP瓒婄晫鏃讹紝鏈浼楳V璁剧疆鎴愬拰MVP涓鏍峰ぇ灏 cost = MAX_DISTORTION; } mv = p_me->bmv; @@ -1039,11 +1043,10 @@ void pred_inter_search_bi(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me, mv_t mvp, mv; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); cu_mv_mode_t *p_mode_mv = cu_get_layer_mode(h, p_cu->cu_info.i_level)->mvs[mode]; - pel_t *buf_pixel_temp = p_enc->buf_pixel_temp; int pu_size_shift = p_cu->cu_info.i_level - MIN_CU_SIZE_IN_BIT; dist_t cost, 
cost_bid; int m, n, i, j; - int b_mv_valid; // MV是否有效:大小取值是否在标准规定的有效范围内 + int b_mv_valid; // MV鏄惁鏈夋晥锛氬ぇ灏忓彇鍊兼槸鍚﹀湪鏍囧噯瑙勫畾鐨勬湁鏁堣寖鍥村唴 int pu_idx_x = p_cb->x != 0; // PU index in CU int pu_idx_y = p_cb->y != 0; int k = (pu_idx_y << 1) + pu_idx_x; @@ -1080,8 +1083,10 @@ void pred_inter_search_bi(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me, b_mv_valid &= check_mv_range_sym(h, &mv, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd); b_mv_valid &= check_mvd(h, mvp.x, mvp.y); // avoid mv-bits calculation error + if (h->param->input_sample_bit_depth == 8) { + pel8_t *buf_pixel_temp = p_enc->buf_pixel_temp8; if (b_mv_valid) { - cost = xavs2_me_search_sym(h, p_me, buf_pixel_temp, &mv); + cost = xavs2_me_search_sym8(h, p_me, buf_pixel_temp, &mv); } else { cost = MAX_DISTORTION; } @@ -1091,7 +1096,7 @@ void pred_inter_search_bi(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me, b_mv_valid &= check_mvd(h, p_me->mvp1.x, p_me->mvp1.y); // avoid mv-bits calculation error b_mv_valid &= check_mvd(h, p_me->mvp2.x, p_me->mvp2.y); if (b_mv_valid) { - cost_bid = xavs2_me_search_bid(h, p_me, buf_pixel_temp, &fwd_mv, &bwd_mv, p_enc); + cost_bid = xavs2_me_search_bid8(h, p_me, buf_pixel_temp, &fwd_mv, &bwd_mv, p_enc); } else { cost_bid = MAX_DISTORTION; } @@ -1127,6 +1132,56 @@ void pred_inter_search_bi(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me, *sym_mcost = cost; *bid_mcost = cost_bid; + } else { + pel10_t *buf_pixel_temp = p_enc->buf_pixel_temp10; + if (b_mv_valid) { + cost = xavs2_me_search_sym10(h, p_me, buf_pixel_temp, &mv); + } else { + cost = MAX_DISTORTION; + } + + b_mv_valid = check_mv_range(h, &fwd_mv, B_FWD, pix_x, pix_y, bsx, bsy); + b_mv_valid &= check_mv_range(h, &bwd_mv, B_BWD, pix_x, pix_y, bsx, bsy); + b_mv_valid &= check_mvd(h, p_me->mvp1.x, p_me->mvp1.y); // avoid mv-bits calculation error + b_mv_valid &= check_mvd(h, p_me->mvp2.x, p_me->mvp2.y); + if (b_mv_valid) { + cost_bid = xavs2_me_search_bid10(h, p_me, buf_pixel_temp, &fwd_mv, &bwd_mv, p_enc); 
+ } else { + cost_bid = MAX_DISTORTION; + } + + // store motion vectors + m = XAVS2_MAX((bsx >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1); + n = XAVS2_MAX((bsy >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1); + for (j = 0; j < n; j++) { + for (i = 0; i < m; i++) { + k = ((pu_idx_y + j) << 1) + (pu_idx_x + i); + p_mode_mv[k].all_sym_mv [0] = mv; + p_mode_mv[k].all_dual_mv_1st[0] = fwd_mv; + p_mode_mv[k].all_dual_mv_2nd[0] = bwd_mv; + } + } + + if (!(check_mv_range(h, &fwd_mv, B_FWD, pix_x, pix_y, bsx, bsy) && + check_mvd(h, (fwd_mv.x - p_me->mvp1.x), (fwd_mv.y - p_me->mvp1.y)))) { + cost_bid = MAX_DISTORTION; + } + + if (!(check_mv_range(h, &bwd_mv, B_BWD, pix_x, pix_y, bsx, bsy) && + check_mvd(h, (bwd_mv.x - p_me->mvp2.x), (bwd_mv.y - p_me->mvp2.y)))) { + cost_bid = MAX_DISTORTION; + } + + if (!(check_mv_range_sym(h, &mv, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd) && + check_mvd(h, (mv.x - mvp.x), (mv.y - mvp.y)))) { + cost = MAX_DISTORTION; + } + p_me->bmvcost[PDIR_SYM] = p_me->mvcost[PDIR_SYM]; + p_me->bmvcost[PDIR_BID] = p_me->mvcost[PDIR_BID]; + + *sym_mcost = cost; + *bid_mcost = cost_bid; + } } /* --------------------------------------------------------------------------- @@ -1139,7 +1194,6 @@ void pred_inter_search_dual(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me mv_t fst_dual, snd_dual; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); cu_mv_mode_t *p_mode_mv = cu_get_layer_mode(h, p_cu->cu_info.i_level)->mvs[mode]; - pel_t *buf_pixel_temp = p_enc->buf_pixel_temp; int pix_x = p_cu->i_pix_x + p_cb->x; int pix_y = p_cu->i_pix_y + p_cb->y; int pu_idx_x = p_cb->x != 0; // PU index @@ -1150,7 +1204,7 @@ void pred_inter_search_dual(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me int ref_idx; dist_t cost; int distance_fwd, distance_bwd; - int b_mv_valid; // MV是否有效:大小取值是否在标准规定的有效范围内 + int b_mv_valid; // MV鏄惁鏈夋晥锛氬ぇ灏忓彇鍊兼槸鍚﹀湪鏍囧噯瑙勫畾鐨勬湁鏁堣寖鍥村唴 int m, n, i, j, k; int max_ref = h->i_ref; @@ -1182,8 +1236,10 @@ void 
pred_inter_search_dual(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me b_mv_valid &= check_mvd(h, (fst_dual.x - p_me->mvp1.x), (fst_dual.y - p_me->mvp1.y)); b_mv_valid &= check_mvd(h, p_me->mvp1.x, p_me->mvp1.y); b_mv_valid &= check_mvd(h, p_me->mvp.x, p_me->mvp.y); + if (h->param->input_sample_bit_depth == 8) { + pel8_t *buf_pixel_temp = p_enc->buf_pixel_temp8; if (b_mv_valid) { - cost = xavs2_me_search_sym(h, p_me, buf_pixel_temp, &fst_dual); + cost = xavs2_me_search_sym8(h, p_me, buf_pixel_temp, &fst_dual); } else { cost = MAX_DISTORTION; } @@ -1215,6 +1271,42 @@ void pred_inter_search_dual(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me p_me->bmvcost[PDIR_DUAL] = p_me->mvcost[PDIR_SYM]; } } + } else { + pel10_t *buf_pixel_temp = p_enc->buf_pixel_temp10; + if (b_mv_valid) { + cost = xavs2_me_search_sym10(h, p_me, buf_pixel_temp, &fst_dual); + } else { + cost = MAX_DISTORTION; + } + + /* store motion vectors and reference frame (for motion vector prediction) */ + snd_dual.v = MAKEDWORD(scale_mv_skip ( fst_dual.x, distance_bwd, distance_fwd), + scale_mv_skip_y(h, fst_dual.y, distance_bwd, distance_fwd)); + + m = XAVS2_MAX((bsx >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1); + n = XAVS2_MAX((bsy >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1); + for (j = 0; j < n; j++) { + for (i = 0; i < m; i++) { + k = ((pu_idx_y + j) << 1) + (pu_idx_x + i); + p_mode_mv[k].all_dual_mv_1st[ref_idx] = fst_dual; + p_mode_mv[k].all_dual_mv_2nd[ref_idx] = snd_dual; + } + } + + b_mv_valid &= check_mv_range_sym(h, &fst_dual, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd); + b_mv_valid &= check_mvd(h, (fst_dual.x - p_me->mvp1.x), (fst_dual.y - p_me->mvp1.y)); + if (!b_mv_valid) { + cost = MAX_DISTORTION; + } else { + cost += REF_COST(ref_idx); + if (cost < *dual_mcost) { + *dual_mcost = cost; + *dual_best_fst_ref = ref_idx; + *dual_best_snd_ref = !ref_idx; + p_me->bmvcost[PDIR_DUAL] = p_me->mvcost[PDIR_SYM]; + } + } + } } } diff --git a/source/encoder/md_intra.c 
b/source/encoder/md_intra.c index 08999e5..37753d9 100644 --- a/source/encoder/md_intra.c +++ b/source/encoder/md_intra.c @@ -62,14 +62,14 @@ uint32_t get_intra_neighbors(xavs2_t *h, int x_4x4, int y_4x4, int bsx, int bsy, const int lcu_mask = (1 << (h->i_lcu_level - 2)) - 1; int leftdown, topright; - /* 1. 检查相邻块是否属于同一个Slice */ + /* 1. 妫鏌ョ浉閭诲潡鏄惁灞炰簬鍚屼竴涓猄lice */ uint32_t b_LEFT = is_block_available(h, x_4x4, y_4x4, -1, 0, cur_slice_idx); uint32_t b_TOP = is_block_available(h, x_4x4, y_4x4, 0, -1, cur_slice_idx); uint32_t b_TOP_LEFT = is_block_available(h, x_4x4, y_4x4, -1, -1, cur_slice_idx); uint32_t b_TOP_RIGHT = is_block_available(h, x_4x4, y_4x4, (bsx >> 1) - 1, -1, cur_slice_idx); // (bsx >> MIN_PU_SIZE_IN_BIT << 1) uint32_t b_LEFT_DOWN = is_block_available(h, x_4x4, y_4x4, -1, (bsy >> 1) - 1, cur_slice_idx); // (bsy >> MIN_PU_SIZE_IN_BIT << 1) - /* 2. 检查相邻块是否在当前块之前重构 */ + /* 2. 妫鏌ョ浉閭诲潡鏄惁鍦ㄥ綋鍓嶅潡涔嬪墠閲嶆瀯 */ x_4x4 &= lcu_mask; y_4x4 &= lcu_mask; leftdown = h->tab_avail_DL[((y_4x4 + (bsy >> 2) - 1) << (h->i_lcu_level - B4X4_IN_BIT)) + (x_4x4)]; @@ -101,29 +101,29 @@ uint32_t get_intra_pu_avail(cu_t *p_cu, int block_x, int block_y, int bsx, int b avail = (avail & (~(1 << MD_I_LEFT_DOWN))) | (!!IS_NEIGHBOR_AVAIL(cu_avail, MD_I_LEFT) << MD_I_LEFT_DOWN); } } else if (block_y == 0) { - avail = (cu_avail & (1 << MD_I_TOP)); // 上边界由CU的上边界决定;左下均不可用 - avail |= (1 << MD_I_LEFT); // 左边界均可用 - avail |= ((cu_avail >> MD_I_TOP) & 1) << MD_I_TOP_LEFT; // 左上由CU上边界可用性决定 - if (block_x + bsx < cu_size) { // 右上由CU上边界和右上边界决定 + avail = (cu_avail & (1 << MD_I_TOP)); // 涓婅竟鐣岀敱CU鐨勪笂杈圭晫鍐冲畾锛涘乏涓嬪潎涓嶅彲鐢 + avail |= (1 << MD_I_LEFT); // 宸﹁竟鐣屽潎鍙敤 + avail |= ((cu_avail >> MD_I_TOP) & 1) << MD_I_TOP_LEFT; // 宸︿笂鐢盋U涓婅竟鐣屽彲鐢ㄦу喅瀹 + if (block_x + bsx < cu_size) { // 鍙充笂鐢盋U涓婅竟鐣屽拰鍙充笂杈圭晫鍐冲畾 avail |= (!!IS_NEIGHBOR_AVAIL(cu_avail, MD_I_TOP)) << MD_I_TOP_RIGHT; } else { avail |= cu_avail & (1 << MD_I_TOP_RIGHT); } } else if (block_x == 0) { - avail = (cu_avail & (1 << MD_I_LEFT)); // 左边界由CU的左边界决定 - avail |= (1 
<< MD_I_TOP); // 上边界均可用 - avail |= ((cu_avail >> MD_I_LEFT) & 1) << MD_I_TOP_LEFT; // 左上由CU上边界可用性决定 - if (bsx < cu_size && bsy < cu_size) { // 右上 + avail = (cu_avail & (1 << MD_I_LEFT)); // 宸﹁竟鐣岀敱CU鐨勫乏杈圭晫鍐冲畾 + avail |= (1 << MD_I_TOP); // 涓婅竟鐣屽潎鍙敤 + avail |= ((cu_avail >> MD_I_LEFT) & 1) << MD_I_TOP_LEFT; // 宸︿笂鐢盋U涓婅竟鐣屽彲鐢ㄦу喅瀹 + if (bsx < cu_size && bsy < cu_size) { // 鍙充笂 avail |= 1 << MD_I_TOP_RIGHT; } - // 左下 + // 宸︿笅 if (block_y + bsy < cu_size) { avail |= (!!IS_NEIGHBOR_AVAIL(cu_avail, MD_I_LEFT)) << MD_I_LEFT_DOWN; } else { avail |= cu_avail & (1 << MD_I_LEFT_DOWN); } } else { - // 右上、左下不可用 + // 鍙充笂銆佸乏涓嬩笉鍙敤 avail = (1 << MD_I_LEFT) | (1 << MD_I_TOP) | (1 << MD_I_TOP_LEFT); } @@ -134,20 +134,20 @@ uint32_t get_intra_pu_avail(cu_t *p_cu, int block_x, int block_y, int bsx, int b * fill reference samples for luma component */ static INLINE -void fill_ref_samples_luma(xavs2_t *h, cu_t *p_cu, pel_t *EP, +void fill_ref_samples_luma8(xavs2_t *h, cu_t *p_cu, pel8_t *EP, int img_x, int img_y, int block_x, int block_y, int bsx, int bsy) { int pos_x = (img_x - h->lcu.i_pix_x - 1); int pos_y = (img_y - h->lcu.i_pix_y - 1); - pel_t *pTL = h->lcu.p_fdec[0] + pos_y * FDEC_STRIDE + pos_x; + pel8_t *pTL = h->lcu.p_fdec8[0] + pos_y * FDEC_STRIDE + pos_x; int xy = (((pos_y + 1) != 0) << 1) + ((pos_x + 1) != 0); uint32_t avail; - /* 1, 检查参考边界有效性 */ + /* 1, 妫鏌ュ弬鑰冭竟鐣屾湁鏁堟 */ if (img_x + 2 * bsx <= h->i_width && img_y + 2 * bsy <= h->i_height - && 0) { // TODO: 高档次下不匹配,仍采用原先默认模式 + && 0) { // TODO: 楂樻。娆′笅涓嶅尮閰嶏紝浠嶉噰鐢ㄥ師鍏堥粯璁ゆā寮 avail = get_intra_pu_avail(p_cu, block_x, block_y, bsx, bsy); } else { int cur_slice_idx = cu_get_slice_index(h, img_x >> MIN_CU_SIZE_IN_BIT, img_y >> MIN_CU_SIZE_IN_BIT); @@ -159,8 +159,38 @@ void fill_ref_samples_luma(xavs2_t *h, cu_t *p_cu, pel_t *EP, p_cu->block_avail = (uint8_t)avail; - /* 2, 完成参考边界像素的填充 */ - g_funcs.fill_edge_f[xy](pTL, FDEC_STRIDE, h->lcu.ctu_border[0].rec_top + pos_x - pos_y, EP, avail, bsx, bsy); + /* 2, 瀹屾垚鍙傝冭竟鐣屽儚绱犵殑濉厖 */ + 
g_funcs.fill_edge8_f[xy](h, pTL, FDEC_STRIDE, h->lcu.ctu_border8[0].rec_top + pos_x - pos_y, EP, avail, bsx, bsy); +} + +static INLINE +void fill_ref_samples_luma10(xavs2_t *h, cu_t *p_cu, pel10_t *EP, + int img_x, int img_y, + int block_x, int block_y, + int bsx, int bsy) +{ + int pos_x = (img_x - h->lcu.i_pix_x - 1); + int pos_y = (img_y - h->lcu.i_pix_y - 1); + pel10_t *pTL = h->lcu.p_fdec10[0] + pos_y * FDEC_STRIDE + pos_x; + int xy = (((pos_y + 1) != 0) << 1) + ((pos_x + 1) != 0); + uint32_t avail; + + /* 1, 妫鏌ュ弬鑰冭竟鐣屾湁鏁堟 */ + if (img_x + 2 * bsx <= h->i_width && img_y + 2 * bsy <= h->i_height + && 0) { // TODO: 楂樻。娆′笅涓嶅尮閰嶏紝浠嶉噰鐢ㄥ師鍏堥粯璁ゆā寮 + avail = get_intra_pu_avail(p_cu, block_x, block_y, bsx, bsy); + } else { + int cur_slice_idx = cu_get_slice_index(h, img_x >> MIN_CU_SIZE_IN_BIT, img_y >> MIN_CU_SIZE_IN_BIT); + int b8_x = img_x >> MIN_PU_SIZE_IN_BIT; + int b8_y = img_y >> MIN_PU_SIZE_IN_BIT; + + avail = get_intra_neighbors(h, b8_x, b8_y, bsx, bsy, cur_slice_idx); + } + + p_cu->block_avail = (uint8_t)avail; + + /* 2, 瀹屾垚鍙傝冭竟鐣屽儚绱犵殑濉厖 */ + g_funcs.fill_edge10_f[xy](h, pTL, FDEC_STRIDE, h->lcu.ctu_border10[0].rec_top + pos_x - pos_y, EP, avail, bsx, bsy); } /* --------------------------------------------------------------------------- @@ -169,18 +199,34 @@ void fill_ref_samples_luma(xavs2_t *h, cu_t *p_cu, pel_t *EP, * \param dst: aligned to 32-byte */ static INLINE -void xavs2_intra_prediction(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int i_avail, int bsx, int bsy) +void xavs2_intra_prediction8(xavs2_t *h, pel8_t *src, pel8_t *dst, int i_dst, int dir_mode, int i_avail, int bsx, int bsy) +{ + //UNUSED_PARAMETER(h); + + if (dir_mode != DC_PRED) { + g_funcs.intraf8[dir_mode](h, src, dst, i_dst, dir_mode, bsx, bsy); + } else { + int b_top = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_TOP); + int b_left = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_LEFT); + int mode_ex = ((b_top << 8) + b_left); + + g_funcs.intraf8[dir_mode](h, src, dst, i_dst, mode_ex, bsx, bsy); + 
} +} + +static INLINE +void xavs2_intra_prediction10(xavs2_t *h, pel10_t *src, pel10_t *dst, int i_dst, int dir_mode, int i_avail, int bsx, int bsy) { - UNUSED_PARAMETER(h); + //UNUSED_PARAMETER(h); if (dir_mode != DC_PRED) { - g_funcs.intraf[dir_mode](src, dst, i_dst, dir_mode, bsx, bsy); + g_funcs.intraf10[dir_mode](h, src, dst, i_dst, dir_mode, bsx, bsy); } else { int b_top = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_TOP); int b_left = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_LEFT); int mode_ex = ((b_top << 8) + b_left); - g_funcs.intraf[dir_mode](src, dst, i_dst, mode_ex, bsx, bsy); + g_funcs.intraf10[dir_mode](h, src, dst, i_dst, mode_ex, bsx, bsy); } } @@ -212,40 +258,78 @@ void update_candidate_list(int mode, rdcost_t cost, int max_num, intra_candidate /* --------------------------------------------------------------------------- * used for generating intra luma prediction samples */ -#define PREDICT_ADD_LUMA(MODE_IDX) \ + +#define PREDICT_ADD_LUMA8(MODE_IDX) \ {\ - pel_t *p_pred = p_enc->intra_pred[MODE_IDX];\ + pel8_t *p_pred = p_enc->intra8_pred[MODE_IDX];\ int mode_bits = (mpm[0] == (MODE_IDX) || mpm[1] == (MODE_IDX)) ? 2 : 6;\ rdcost_t cost = h->f_lambda_mode * mode_bits; \ \ - xavs2_intra_prediction(h, edge_pixels, p_pred, block_w, MODE_IDX,\ + xavs2_intra_prediction8(h, edge_pixels, p_pred, block_w, MODE_IDX,\ p_cu->block_avail, block_w, block_h);\ - cost += intra_cmp(p_fenc, FENC_STRIDE, p_pred, block_w);\ + cost += intra8_cmp(p_fenc, FENC_STRIDE, p_pred, block_w);\ + update_candidate_list(MODE_IDX, cost, INTRA_MODE_NUM_FOR_RDO, p_candidates);\ +} + +#define PREDICT_ADD_LUMA10(MODE_IDX) \ +{\ + pel10_t *p_pred = p_enc->intra10_pred[MODE_IDX];\ + int mode_bits = (mpm[0] == (MODE_IDX) || mpm[1] == (MODE_IDX)) ? 
2 : 6;\ + rdcost_t cost = h->f_lambda_mode * mode_bits; \ + \ + xavs2_intra_prediction10(h, edge_pixels, p_pred, block_w, MODE_IDX,\ + p_cu->block_avail, block_w, block_h);\ + cost += intra10_cmp(p_fenc, FENC_STRIDE, p_pred, block_w);\ update_candidate_list(MODE_IDX, cost, INTRA_MODE_NUM_FOR_RDO, p_candidates);\ } /* --------------------------------------------------------------------------- * return numbers for RDO and candidate list by scanning all the intra modes */ -int rdo_get_pred_intra_luma(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, - pel_t *p_fenc, int mpm[], int blockidx, +int rdo_get_pred_intra_luma8(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel8_t *p_fenc, int mpm[], int blockidx, + int block_x, int block_y, int block_w, int block_h) +{ + pixel8_cmp_t intra8_cmp = g_funcs.pixf.intra8_cmp[PART_INDEX(block_w, block_h)]; + cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); + pel8_t *edge_pixels = &p_enc->buf_edge_pixels8[(MAX_CU_SIZE << 2) - 1]; + int mode; + int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x; + int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y; + + /* get edge samples for intra prediction */ + fill_ref_samples_luma8(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); + + UNUSED_PARAMETER(blockidx); + + /* loop over all intra predication modes */ + for (mode = 0; mode < NUM_INTRA_MODE; mode++) { + PREDICT_ADD_LUMA8(mode); + } + + p_cu->feature.intra_had_cost = p_candidates[0].cost; + return h->tab_num_intra_rdo[p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON)]; +} + +int rdo_get_pred_intra_luma10(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel10_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h) { - pixel_cmp_t intra_cmp = g_funcs.pixf.intra_cmp[PART_INDEX(block_w, block_h)]; + pixel10_cmp_t intra10_cmp = g_funcs.pixf.intra10_cmp[PART_INDEX(block_w, block_h)]; cu_parallel_t *p_enc = 
cu_get_enc_context(h, p_cu->cu_info.i_level); - pel_t *edge_pixels = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 2) - 1]; + pel10_t *edge_pixels = &p_enc->buf_edge_pixels10[(MAX_CU_SIZE << 2) - 1]; int mode; int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x; int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y; /* get edge samples for intra prediction */ - fill_ref_samples_luma(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); + fill_ref_samples_luma10(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); UNUSED_PARAMETER(blockidx); /* loop over all intra predication modes */ for (mode = 0; mode < NUM_INTRA_MODE; mode++) { - PREDICT_ADD_LUMA(mode); + PREDICT_ADD_LUMA10(mode); } p_cu->feature.intra_had_cost = p_candidates[0].cost; @@ -255,16 +339,16 @@ int rdo_get_pred_intra_luma(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candida /* --------------------------------------------------------------------------- * return numbers for RDO and candidate list by rough scanning */ -int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, - pel_t *p_fenc, int mpm[], int blockidx, +int rdo_get_pred_intra_luma8_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel8_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h) { int visited[NUM_INTRA_MODE] = { 0 }; /* 0: not visited yet * 1: visited in the first phase * 2: visited in final_mode */ - pixel_cmp_t intra_cmp = g_funcs.pixf.intra_cmp[PART_INDEX(block_w, block_h)]; + pixel8_cmp_t intra8_cmp = g_funcs.pixf.intra8_cmp[PART_INDEX(block_w, block_h)]; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); - pel_t *edge_pixels = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 2) - 1]; + pel8_t *edge_pixels = &p_enc->buf_edge_pixels8[(MAX_CU_SIZE << 2) - 1]; int mode, i, j; int num_angle = 0; int num_for_rdo; @@ -273,23 +357,23 @@ int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t 
*p_can int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y; /* get edge samples for intra prediction */ - fill_ref_samples_luma(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); + fill_ref_samples_luma8(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); UNUSED_PARAMETER(blockidx); - /* 1, 遍历基础模式, - * (1.1) 几个关键的角度 */ + /* 1, 閬嶅巻鍩虹妯″紡锛 + * (1.1) 鍑犱釜鍏抽敭鐨勮搴 */ for (mode = 0; mode < 3; mode++) { - PREDICT_ADD_LUMA(mode); + PREDICT_ADD_LUMA8(mode); visited[mode] = 1; } - /* (1.2) 角度预测模式 */ + /* (1.2) 瑙掑害棰勬祴妯″紡 */ for (mode = 4; mode < NUM_INTRA_MODE; mode += 4) { - PREDICT_ADD_LUMA(mode); + PREDICT_ADD_LUMA8(mode); visited[mode] = 1; } - /* 2, 遍历N个最优的模式的距离为二的模式,如果较优则放到CandModeList中 */ + /* 2, 閬嶅巻N涓渶浼樼殑妯″紡鐨勮窛绂讳负浜岀殑妯″紡锛屽鏋滆緝浼樺垯鏀惧埌CandModeList涓 */ num_to_add = h->num_intra_rmd_dist2; for (i = 0; i < num_to_add; i++) { mode = p_candidates[i].mode; @@ -299,18 +383,18 @@ int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_can if (mode > 3 && !visited[mode - 2]) { j = mode - 2; - PREDICT_ADD_LUMA(j); + PREDICT_ADD_LUMA8(j); visited[j] = 1; } if (mode < NUM_INTRA_MODE - 2 && !visited[mode + 2]) { j = mode + 2; - PREDICT_ADD_LUMA(j); + PREDICT_ADD_LUMA8(j); visited[j] = 1; } } - /* 3, 把以上得到的最佳的两个模式的距离为一的模式放在CandModeList中 */ + /* 3, 鎶婁互涓婂緱鍒扮殑鏈浣崇殑涓や釜妯″紡鐨勮窛绂讳负涓鐨勬ā寮忔斁鍦–andModeList涓 */ num_to_add = h->num_intra_rmd_dist1; for (i = 0, num_angle = 0; num_angle < num_to_add && i < INTRA_MODE_NUM_FOR_RDO; i++) { mode = p_candidates[i].mode; @@ -320,42 +404,179 @@ int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_can if (mode > 3 && !visited[mode - 1]) { j = mode - 1; - PREDICT_ADD_LUMA(j); + PREDICT_ADD_LUMA8(j); visited[j] = 1; num_angle++; } if (mode < NUM_INTRA_MODE - 1 && !visited[mode + 1]) { j = mode + 1; - PREDICT_ADD_LUMA(j); + PREDICT_ADD_LUMA8(j); visited[j] = 1; num_angle++; } } - /* 4, 查找最优列表中是否有MPMs,若没有,则加入,若有则不用加入 */ + /* 4, 鏌ユ壘鏈浼樺垪琛ㄤ腑鏄惁鏈塎PMs锛岃嫢娌℃湁锛屽垯鍔犲叆锛岃嫢鏈夊垯涓嶇敤鍔犲叆 */ if 
(!visited[mpm[0]]) { mode = mpm[0]; - PREDICT_ADD_LUMA(mode); + PREDICT_ADD_LUMA8(mode); visited[mode] = 1; } if (!visited[mpm[1]]) { mode = mpm[1]; - PREDICT_ADD_LUMA(mode); + PREDICT_ADD_LUMA8(mode); visited[mode] = 1; } num_for_rdo = h->tab_num_intra_rdo[p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON)]; - /* 若当前局部最优的两个模式是MPM之一,则减少RDO模式数量 */ + /* 鑻ュ綋鍓嶅眬閮ㄦ渶浼樼殑涓や釜妯″紡鏄疢PM涔嬩竴锛屽垯鍑忓皯RDO妯″紡鏁伴噺 */ if (p_candidates[0].mode == mpm[0] || p_candidates[0].mode == mpm[1] || p_candidates[1].mode == mpm[0] || p_candidates[1].mode == mpm[1]) { num_for_rdo = XAVS2_MIN(num_for_rdo, 3); return num_for_rdo; } - /* 从M个最优模式中选定最终参加RDO的模式,即去重 */ + /* 浠嶮涓渶浼樻ā寮忎腑閫夊畾鏈缁堝弬鍔燫DO鐨勬ā寮忥紝鍗冲幓閲 */ + visited[p_candidates[0].mode] = 2; + visited[p_candidates[1].mode] = 2; + + for (i = 2, j = 2; i < INTRA_MODE_NUM_FOR_RDO && j < num_for_rdo; i++) { + mode = p_candidates[i].mode; + if (!visited[mode]) { + continue; + } + if (mode <= 2) { + p_candidates[j++].mode = mode; + visited[mode] = 2; + } else if (mode == 3) { + if (visited[4] == 1) { + p_candidates[j++].mode = 3; + visited[3] = 2; + } + } else if (mode == 32) { + if (visited[31] == 1) { + p_candidates[j++].mode = 32; + visited[32] = 2; + } + } else { + if (visited[mode - 1] == 1 && visited[mode + 1] == 1) { + p_candidates[j++].mode = mode; + visited[mode] = 2; + } + } + if (visited[0] == 2 && visited[1] == 2 && visited[2] == 2) { + break; + } + } + + p_cu->feature.intra_had_cost = p_candidates[0].cost; + return XAVS2_MIN(num_for_rdo, j); +} + +int rdo_get_pred_intra_luma10_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel10_t *p_fenc, int mpm[], int blockidx, + int block_x, int block_y, int block_w, int block_h) +{ + int visited[NUM_INTRA_MODE] = { 0 }; /* 0: not visited yet + * 1: visited in the first phase + * 2: visited in final_mode */ + pixel10_cmp_t intra10_cmp = g_funcs.pixf.intra10_cmp[PART_INDEX(block_w, block_h)]; + cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); + pel10_t 
*edge_pixels = &p_enc->buf_edge_pixels10[(MAX_CU_SIZE << 2) - 1]; + int mode, i, j; + int num_angle = 0; + int num_for_rdo; + int num_to_add; + int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x; + int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y; + + /* get edge samples for intra prediction */ + fill_ref_samples_luma10(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); + + UNUSED_PARAMETER(blockidx); + + /* 1, 閬嶅巻鍩虹妯″紡锛 + * (1.1) 鍑犱釜鍏抽敭鐨勮搴 */ + for (mode = 0; mode < 3; mode++) { + PREDICT_ADD_LUMA10(mode); + visited[mode] = 1; + } + /* (1.2) 瑙掑害棰勬祴妯″紡 */ + for (mode = 4; mode < NUM_INTRA_MODE; mode += 4) { + PREDICT_ADD_LUMA10(mode); + visited[mode] = 1; + } + + /* 2, 閬嶅巻N涓渶浼樼殑妯″紡鐨勮窛绂讳负浜岀殑妯″紡锛屽鏋滆緝浼樺垯鏀惧埌CandModeList涓 */ + num_to_add = h->num_intra_rmd_dist2; + for (i = 0; i < num_to_add; i++) { + mode = p_candidates[i].mode; + if (mode <= 2) { + continue; + } + + if (mode > 3 && !visited[mode - 2]) { + j = mode - 2; + PREDICT_ADD_LUMA10(j); + visited[j] = 1; + } + + if (mode < NUM_INTRA_MODE - 2 && !visited[mode + 2]) { + j = mode + 2; + PREDICT_ADD_LUMA10(j); + visited[j] = 1; + } + } + + /* 3, 鎶婁互涓婂緱鍒扮殑鏈浣崇殑涓や釜妯″紡鐨勮窛绂讳负涓鐨勬ā寮忔斁鍦–andModeList涓 */ + num_to_add = h->num_intra_rmd_dist1; + for (i = 0, num_angle = 0; num_angle < num_to_add && i < INTRA_MODE_NUM_FOR_RDO; i++) { + mode = p_candidates[i].mode; + if (mode <= 2) { + continue; + } + + if (mode > 3 && !visited[mode - 1]) { + j = mode - 1; + PREDICT_ADD_LUMA10(j); + visited[j] = 1; + num_angle++; + } + + if (mode < NUM_INTRA_MODE - 1 && !visited[mode + 1]) { + j = mode + 1; + PREDICT_ADD_LUMA10(j); + visited[j] = 1; + num_angle++; + } + } + + /* 4, 鏌ユ壘鏈浼樺垪琛ㄤ腑鏄惁鏈塎PMs锛岃嫢娌℃湁锛屽垯鍔犲叆锛岃嫢鏈夊垯涓嶇敤鍔犲叆 */ + if (!visited[mpm[0]]) { + mode = mpm[0]; + PREDICT_ADD_LUMA10(mode); + visited[mode] = 1; + } + + if (!visited[mpm[1]]) { + mode = mpm[1]; + PREDICT_ADD_LUMA10(mode); + visited[mode] = 1; + } + + num_for_rdo = h->tab_num_intra_rdo[p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != 
TU_SPLIT_NON)]; + + /* 鑻ュ綋鍓嶅眬閮ㄦ渶浼樼殑涓や釜妯″紡鏄疢PM涔嬩竴锛屽垯鍑忓皯RDO妯″紡鏁伴噺 */ + if (p_candidates[0].mode == mpm[0] || p_candidates[0].mode == mpm[1] || + p_candidates[1].mode == mpm[0] || p_candidates[1].mode == mpm[1]) { + num_for_rdo = XAVS2_MIN(num_for_rdo, 3); + return num_for_rdo; + } + + /* 浠嶮涓渶浼樻ā寮忎腑閫夊畾鏈缁堝弬鍔燫DO鐨勬ā寮忥紝鍗冲幓閲 */ visited[p_candidates[0].mode] = 2; visited[p_candidates[1].mode] = 2; @@ -396,24 +617,48 @@ int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_can /* --------------------------------------------------------------------------- * return the best intra prediction mode from the 1st run */ -int rdo_get_pred_intra_luma_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, - pel_t *p_fenc, int mpm[], int blockidx, +int rdo_get_pred_intra_luma8_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel8_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h) { cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); int best_intra_mode = p_cu->cu_info.real_intra_modes[blockidx]; - pel_t *edge_pixels = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 2) - 1]; - pel_t *p_pred = p_enc->intra_pred[best_intra_mode]; + pel8_t *edge_pixels = &p_enc->buf_edge_pixels8[(MAX_CU_SIZE << 2) - 1]; + pel8_t *p_pred = p_enc->intra8_pred[best_intra_mode]; int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x; int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y; /* get edge samples for intra prediction */ - fill_ref_samples_luma(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); + fill_ref_samples_luma8(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); UNUSED_PARAMETER(p_fenc); UNUSED_PARAMETER(mpm); - xavs2_intra_prediction(h, edge_pixels, p_pred, block_w, best_intra_mode, p_cu->block_avail, block_w, block_h); + xavs2_intra_prediction8(h, edge_pixels, p_pred, block_w, best_intra_mode, p_cu->block_avail, block_w, block_h); + p_candidates[0].mode 
= best_intra_mode; + p_candidates[0].cost = 0; + + return 1; +} + +int rdo_get_pred_intra_luma10_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, + pel10_t *p_fenc, int mpm[], int blockidx, + int block_x, int block_y, int block_w, int block_h) +{ + cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); + int best_intra_mode = p_cu->cu_info.real_intra_modes[blockidx]; + pel10_t *edge_pixels = &p_enc->buf_edge_pixels10[(MAX_CU_SIZE << 2) - 1]; + pel10_t *p_pred = p_enc->intra10_pred[best_intra_mode]; + int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x; + int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y; + + /* get edge samples for intra prediction */ + fill_ref_samples_luma10(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); + + UNUSED_PARAMETER(p_fenc); + UNUSED_PARAMETER(mpm); + + xavs2_intra_prediction10(h, edge_pixels, p_pred, block_w, best_intra_mode, p_cu->block_avail, block_w, block_h); p_candidates[0].mode = best_intra_mode; p_candidates[0].cost = 0; @@ -430,46 +675,113 @@ int rdo_get_pred_intra_chroma_fast(xavs2_t *h, cu_t *p_cu, int i_level, int pix_ intra_candidate_t *p_candidate_list) { cu_parallel_t *p_enc = cu_get_enc_context(h, i_level + 1); - pel_t *p_fenc_u = h->lcu.p_fenc[IMG_U] + pix_y_c * FENC_STRIDE + pix_x_c; - pel_t *p_fenc_v = h->lcu.p_fenc[IMG_V] + pix_y_c * FENC_STRIDE + pix_x_c; + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_fenc_u = h->lcu.p_fenc8[IMG_U] + pix_y_c * FENC_STRIDE + pix_x_c; + pel8_t *p_fenc_v = h->lcu.p_fenc8[IMG_V] + pix_y_c * FENC_STRIDE + pix_x_c; + int blksize = 1 << i_level; + pixel8_cmp_t intra_chroma_cost = g_funcs.pixf.intra8_cmp[PART_INDEX(blksize, blksize)]; + int num_for_rdo = 0; + + int LUMA_MODE[5] = { -1, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; // map chroma mode to luma mode + pel8_t *EP_u = &p_enc->buf_edge_pixels8[(MAX_CU_SIZE << 1) - 1]; + pel8_t *EP_v = EP_u + (MAX_CU_SIZE << 2); + int xy = p_cu->in_lcu_edge; + + /* 
璁$畻U銆乂鍒嗛噺鐨勫乏涓婅鍍忕礌鐐圭殑浣嶇疆 */ + pel8_t *pTL_u = h->lcu.p_fdec8[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; + pel8_t *pTL_v = h->lcu.p_fdec8[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; + int offset = (FREC_CSTRIDE >> 1); + int m; + + /* 妫鏌ヨ竟鐣屾湁鏁堟 */ + uint32_t avail = p_cu->intra_avail; + + /* 璁$畻姣忎釜妯″紡鍙峰搴旂殑棰勬祴妯″紡 */ + LUMA_MODE[0] = p_cu->cu_info.real_intra_modes[0]; + + /* 2.1, 鑾峰彇鍙傝冭竟鐣屽儚绱 */ + g_funcs.fill_edge8_f[xy](h, pTL_u, FDEC_STRIDE, h->lcu.ctu_border8[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, blksize, blksize); + g_funcs.fill_edge8_f[xy](h, pTL_v, FDEC_STRIDE, h->lcu.ctu_border8[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, blksize, blksize); + + for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) { + p_candidate_list[m].mode = DM_PRED_C; + p_candidate_list[m].cost = MAX_COST; + } + + /* 2.2, 鎵ц棰勬祴 */ + for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) { + pel8_t *p_pred_u = p_enc->intra8_pred_c[m]; + pel8_t *p_pred_v = p_enc->intra8_pred_c[m] + offset; + rdcost_t est_cost; + + xavs2_intra_prediction8(h, EP_u, p_pred_u, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize); + xavs2_intra_prediction8(h, EP_v, p_pred_v, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize); + + est_cost = intra_chroma_cost(p_fenc_u, FENC_STRIDE, p_pred_u, FREC_CSTRIDE); + est_cost += intra_chroma_cost(p_fenc_v, FENC_STRIDE, p_pred_v, FREC_CSTRIDE); + + update_candidate_list(m, est_cost, NUM_INTRA_MODE_CHROMA, p_candidate_list); + } + + if (h->i_type != SLICE_TYPE_I) { + num_for_rdo = NUM_INTRA_C_FULL_RD; + if (i_level == 6) { + num_for_rdo -= 2; + } else if (i_level == 5) { + num_for_rdo -= 1; + } + } else { + num_for_rdo = NUM_INTRA_MODE_CHROMA; + } + + if (p_candidate_list[0].mode == DM_PRED_C) { + num_for_rdo = 1; + } + + num_for_rdo = XAVS2_MIN(h->num_rdo_intra_chroma, num_for_rdo); + + return num_for_rdo; + } else { + pel10_t *p_fenc_u = h->lcu.p_fenc10[IMG_U] + pix_y_c * FENC_STRIDE + pix_x_c; + pel10_t *p_fenc_v = h->lcu.p_fenc10[IMG_V] + pix_y_c * FENC_STRIDE + pix_x_c; 
int blksize = 1 << i_level; - pixel_cmp_t intra_chroma_cost = g_funcs.pixf.intra_cmp[PART_INDEX(blksize, blksize)]; + pixel10_cmp_t intra_chroma_cost = g_funcs.pixf.intra10_cmp[PART_INDEX(blksize, blksize)]; int num_for_rdo = 0; int LUMA_MODE[5] = { -1, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; // map chroma mode to luma mode - pel_t *EP_u = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 1) - 1]; - pel_t *EP_v = EP_u + (MAX_CU_SIZE << 2); + pel10_t *EP_u = &p_enc->buf_edge_pixels10[(MAX_CU_SIZE << 1) - 1]; + pel10_t *EP_v = EP_u + (MAX_CU_SIZE << 2); int xy = p_cu->in_lcu_edge; - /* 计算U、V分量的左上角像素点的位置 */ - pel_t *pTL_u = h->lcu.p_fdec[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; - pel_t *pTL_v = h->lcu.p_fdec[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; + /* 璁$畻U銆乂鍒嗛噺鐨勫乏涓婅鍍忕礌鐐圭殑浣嶇疆 */ + pel10_t *pTL_u = h->lcu.p_fdec10[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; + pel10_t *pTL_v = h->lcu.p_fdec10[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; int offset = (FREC_CSTRIDE >> 1); int m; - /* 检查边界有效性 */ + /* 妫鏌ヨ竟鐣屾湁鏁堟 */ uint32_t avail = p_cu->intra_avail; - /* 计算每个模式号对应的预测模式 */ + /* 璁$畻姣忎釜妯″紡鍙峰搴旂殑棰勬祴妯″紡 */ LUMA_MODE[0] = p_cu->cu_info.real_intra_modes[0]; - /* 2.1, 获取参考边界像素 */ - g_funcs.fill_edge_f[xy](pTL_u, FDEC_STRIDE, h->lcu.ctu_border[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, blksize, blksize); - g_funcs.fill_edge_f[xy](pTL_v, FDEC_STRIDE, h->lcu.ctu_border[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, blksize, blksize); + /* 2.1, 鑾峰彇鍙傝冭竟鐣屽儚绱 */ + g_funcs.fill_edge10_f[xy](h, pTL_u, FDEC_STRIDE, h->lcu.ctu_border10[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, blksize, blksize); + g_funcs.fill_edge10_f[xy](h, pTL_v, FDEC_STRIDE, h->lcu.ctu_border10[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, blksize, blksize); for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) { p_candidate_list[m].mode = DM_PRED_C; p_candidate_list[m].cost = MAX_COST; } - /* 2.2, 执行预测 */ + /* 2.2, 鎵ц棰勬祴 */ for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) { - pel_t *p_pred_u = p_enc->intra_pred_c[m]; - 
pel_t *p_pred_v = p_enc->intra_pred_c[m] + offset; + pel10_t *p_pred_u = p_enc->intra10_pred_c[m]; + pel10_t *p_pred_v = p_enc->intra10_pred_c[m] + offset; rdcost_t est_cost; - xavs2_intra_prediction(h, EP_u, p_pred_u, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize); - xavs2_intra_prediction(h, EP_v, p_pred_v, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize); + xavs2_intra_prediction10(h, EP_u, p_pred_u, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize); + xavs2_intra_prediction10(h, EP_v, p_pred_v, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize); est_cost = intra_chroma_cost(p_fenc_u, FENC_STRIDE, p_pred_u, FREC_CSTRIDE); est_cost += intra_chroma_cost(p_fenc_v, FENC_STRIDE, p_pred_v, FREC_CSTRIDE); @@ -495,6 +807,7 @@ int rdo_get_pred_intra_chroma_fast(xavs2_t *h, cu_t *p_cu, int i_level, int pix_ num_for_rdo = XAVS2_MIN(h->num_rdo_intra_chroma, num_for_rdo); return num_for_rdo; + } } //#endif @@ -504,39 +817,75 @@ int rdo_get_pred_intra_chroma_fast(xavs2_t *h, cu_t *p_cu, int i_level, int pix_ int rdo_get_pred_intra_chroma(xavs2_t *h, cu_t *p_cu, int i_level_c, int pix_y_c, int pix_x_c, intra_candidate_t *p_candidate_list) { + if (h->param->input_sample_bit_depth == 8) { int LUMA_MODE[5] = { -1, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; // map chroma mode to luma mode cu_parallel_t *p_enc = cu_get_enc_context(h, i_level_c + 1); - pel_t *EP_u = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 1) - 1]; - pel_t *EP_v = EP_u + (MAX_CU_SIZE << 2); + pel8_t *EP_u = &p_enc->buf_edge_pixels8[(MAX_CU_SIZE << 1) - 1]; + pel8_t *EP_v = EP_u + (MAX_CU_SIZE << 2); int bsize = 1 << i_level_c; int xy = p_cu->in_lcu_edge; - /* 计算U、V分量的左上角像素点的位置 */ - pel_t *pTL_u = h->lcu.p_fdec[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; - pel_t *pTL_v = h->lcu.p_fdec[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; + /* 璁$畻U銆乂鍒嗛噺鐨勫乏涓婅鍍忕礌鐐圭殑浣嶇疆 */ + pel8_t *pTL_u = h->lcu.p_fdec8[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; + pel8_t *pTL_v = h->lcu.p_fdec8[2] + (pix_y_c - 1) * 
FDEC_STRIDE + pix_x_c - 1; int offset = (FREC_CSTRIDE >> 1); int m; - /* 检查边界有效性 */ + /* 妫鏌ヨ竟鐣屾湁鏁堟 */ uint32_t avail = p_cu->intra_avail; - /* 计算每个模式号对应的预测模式 */ + /* 璁$畻姣忎釜妯″紡鍙峰搴旂殑棰勬祴妯″紡 */ LUMA_MODE[0] = p_cu->cu_info.real_intra_modes[0]; - /* 2.1, 获取参考边界像素 */ - g_funcs.fill_edge_f[xy](pTL_u, FDEC_STRIDE, h->lcu.ctu_border[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, bsize, bsize); - g_funcs.fill_edge_f[xy](pTL_v, FDEC_STRIDE, h->lcu.ctu_border[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, bsize, bsize); + /* 2.1, 鑾峰彇鍙傝冭竟鐣屽儚绱 */ + g_funcs.fill_edge8_f[xy](h, pTL_u, FDEC_STRIDE, h->lcu.ctu_border8[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, bsize, bsize); + g_funcs.fill_edge8_f[xy](h, pTL_v, FDEC_STRIDE, h->lcu.ctu_border8[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, bsize, bsize); - /* 2.2, 执行预测 */ + /* 2.2, 鎵ц棰勬祴 */ for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) { - xavs2_intra_prediction(h, EP_u, p_enc->intra_pred_c[m] + 0, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize); - xavs2_intra_prediction(h, EP_v, p_enc->intra_pred_c[m] + offset, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize); + xavs2_intra_prediction8(h, EP_u, p_enc->intra8_pred_c[m] + 0, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize); + xavs2_intra_prediction8(h, EP_v, p_enc->intra8_pred_c[m] + offset, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize); p_candidate_list[m].mode = m; p_candidate_list[m].cost = MAX_COST; } return NUM_INTRA_MODE_CHROMA; + } else { + int LUMA_MODE[5] = { -1, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; // map chroma mode to luma mode + cu_parallel_t *p_enc = cu_get_enc_context(h, i_level_c + 1); + pel10_t *EP_u = &p_enc->buf_edge_pixels10[(MAX_CU_SIZE << 1) - 1]; + pel10_t *EP_v = EP_u + (MAX_CU_SIZE << 2); + int bsize = 1 << i_level_c; + int xy = p_cu->in_lcu_edge; + + /* 璁$畻U銆乂鍒嗛噺鐨勫乏涓婅鍍忕礌鐐圭殑浣嶇疆 */ + pel10_t *pTL_u = h->lcu.p_fdec10[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; + pel10_t *pTL_v = h->lcu.p_fdec10[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; + int 
offset = (FREC_CSTRIDE >> 1); + int m; + + /* 妫鏌ヨ竟鐣屾湁鏁堟 */ + uint32_t avail = p_cu->intra_avail; + + /* 璁$畻姣忎釜妯″紡鍙峰搴旂殑棰勬祴妯″紡 */ + LUMA_MODE[0] = p_cu->cu_info.real_intra_modes[0]; + + /* 2.1, 鑾峰彇鍙傝冭竟鐣屽儚绱 */ + g_funcs.fill_edge10_f[xy](h, pTL_u, FDEC_STRIDE, h->lcu.ctu_border10[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, bsize, bsize); + g_funcs.fill_edge10_f[xy](h, pTL_v, FDEC_STRIDE, h->lcu.ctu_border10[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, bsize, bsize); + + /* 2.2, 鎵ц棰勬祴 */ + for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) { + xavs2_intra_prediction10(h, EP_u, p_enc->intra10_pred_c[m] + 0, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize); + xavs2_intra_prediction10(h, EP_v, p_enc->intra10_pred_c[m] + offset, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize); + + p_candidate_list[m].mode = m; + p_candidate_list[m].cost = MAX_COST; + } + + return NUM_INTRA_MODE_CHROMA; + } } /* --------------------------------------------------------------------------- diff --git a/source/encoder/me.c b/source/encoder/me.c index db3dea7..f79ff86 100644 --- a/source/encoder/me.c +++ b/source/encoder/me.c @@ -125,7 +125,7 @@ static int8_t GRID[24][2] = { }; /* --------------------------------------------------------------------------- - * 用于分像素搜索的正方形搜索 */ + * 鐢ㄤ簬鍒嗗儚绱犳悳绱㈢殑姝f柟褰㈡悳绱 */ static const int8_t Spiral[9][2] = { { 0, 0 }, { 0, -1 }, { 0, 1 }, { -1, -1 }, { 1, -1 }, { -1, 0 }, @@ -162,11 +162,18 @@ static const int i_org = FENC_STRIDE; /* --------------------------------------------------------------------------- * early termination */ -#define EARLY_TERMINATION(pred_sad) \ +#define EARLY_TERMINATION8(pred_sad) \ if (bcost < (pred_sad) * beta3) {\ - goto umh_step_3;\ + goto umh_step8_3;\ } else if (bcost < (pred_sad) * beta2) {\ - goto umh_step_2;\ + goto umh_step8_2;\ + } + +#define EARLY_TERMINATION10(pred_sad) \ + if (bcost < (pred_sad) * beta3) {\ + goto umh_step10_3;\ + } else if (bcost < (pred_sad) * beta2) {\ + goto umh_step10_2;\ } @@ -178,34 +185,67 @@ static 
const int i_org = FENC_STRIDE; /* --------------------------------------------------------------------------- */ -#define CAL_COST_IPEL(mx, my) \ - g_funcs.pixf.sad[i_pixel](p_org, i_org,\ +#define CAL_COST_IPEL8(mx, my) \ + g_funcs.pixf.sad8[i_pixel](p_org, i_org,\ + p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my) + +#define CAL_COST_IPEL10(mx, my) \ + g_funcs.pixf.sad10[i_pixel](p_org, i_org,\ p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my) /* --------------------------------------------------------------------------- */ -#define ME_COST_IPEL(mx, my) \ +#define ME_COST_IPEL8(mx, my) \ + if (CHECK_MV_RANGE(mx, my)) {\ + int cost = g_funcs.pixf.sad8[i_pixel](p_org, i_org,\ + p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\ + COPY3_IF_LT(bcost, cost, bmx, mx, bmy, my);\ + } +#define ME_COST_IPEL10(mx, my) \ if (CHECK_MV_RANGE(mx, my)) {\ - int cost = g_funcs.pixf.sad[i_pixel](p_org, i_org,\ + int cost = g_funcs.pixf.sad10[i_pixel](p_org, i_org,\ p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\ COPY3_IF_LT(bcost, cost, bmx, mx, bmy, my);\ } /* --------------------------------------------------------------------------- */ -#define ME_COST_IPEL_DIR(mx, my, d) \ +#define ME_COST_IPEL8_DIR(mx, my, d) \ + if (CHECK_MV_RANGE(mx, my)) {\ + int cost = g_funcs.pixf.sad8[i_pixel](p_org, i_org,\ + p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\ + COPY4_IF_LT(bcost, cost, bmx, mx, bmy, my, dir, d);\ + } + +#define ME_COST_IPEL10_DIR(mx, my, d) \ if (CHECK_MV_RANGE(mx, my)) {\ - int cost = g_funcs.pixf.sad[i_pixel](p_org, i_org,\ + int cost = g_funcs.pixf.sad10[i_pixel](p_org, i_org,\ p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\ COPY4_IF_LT(bcost, cost, bmx, mx, bmy, my, dir, d);\ } /* --------------------------------------------------------------------------- */ -#define ME_COST_IPEL_X3(m0x, m0y, m1x, m1y, m2x, m2y) \ +#define ME_COST_IPEL8_X3(m0x, m0y, m1x, m1y, m2x, m2y) \ +{\ + pel8_t 
*pix_base = p_fref + omy * i_fref + omx;\ + g_funcs.pixf.sad8_x3[i_pixel](p_org,\ + pix_base + (m0y) * i_fref + (m0x),\ + pix_base + (m1y) * i_fref + (m1x),\ + pix_base + (m2y) * i_fref + (m2x),\ + i_fref, costs);\ + costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\ + costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\ + costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\ + COPY3_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y));\ + COPY3_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y));\ + COPY3_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y));\ +} + +#define ME_COST_IPEL10_X3(m0x, m0y, m1x, m1y, m2x, m2y) \ {\ - pel_t *pix_base = p_fref + omy * i_fref + omx;\ - g_funcs.pixf.sad_x3[i_pixel](p_org,\ + pel10_t *pix_base = p_fref + omy * i_fref + omx;\ + g_funcs.pixf.sad10_x3[i_pixel](p_org,\ pix_base + (m0y) * i_fref + (m0x),\ pix_base + (m1y) * i_fref + (m1x),\ pix_base + (m2y) * i_fref + (m2x),\ @@ -220,10 +260,26 @@ static const int i_org = FENC_STRIDE; /* --------------------------------------------------------------------------- */ -#define ME_COST_IPEL_X3_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2) \ +#define ME_COST_IPEL8_X3_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2) \ +{\ + pel8_t *pix_base = p_fref + omy * i_fref + omx;\ + g_funcs.pixf.sad8_x3[i_pixel](p_org,\ + pix_base + (m0y) * i_fref + (m0x),\ + pix_base + (m1y) * i_fref + (m1x),\ + pix_base + (m2y) * i_fref + (m2x),\ + i_fref, costs);\ + costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\ + costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\ + costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\ + COPY4_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y), dir, d0);\ + COPY4_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y), dir, d1);\ + COPY4_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y), dir, d2);\ +} + +#define ME_COST_IPEL10_X3_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2) \ {\ - pel_t *pix_base = p_fref + omy * i_fref + omx;\ - 
g_funcs.pixf.sad_x3[i_pixel](p_org,\ + pel10_t *pix_base = p_fref + omy * i_fref + omx;\ + g_funcs.pixf.sad10_x3[i_pixel](p_org,\ pix_base + (m0y) * i_fref + (m0x),\ pix_base + (m1y) * i_fref + (m1x),\ pix_base + (m2y) * i_fref + (m2x),\ @@ -238,11 +294,37 @@ static const int i_org = FENC_STRIDE; /* --------------------------------------------------------------------------- */ -#define ME_COST_IPEL_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \ +#define ME_COST_IPEL8_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \ +{\ + if (CHECK_MV_RANGE_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y)) { \ + pel8_t *pix_base = p_fref + omy * i_fref + omx;\ + g_funcs.pixf.sad8_x4[i_pixel](p_org,\ + pix_base + (m0y) * i_fref + (m0x),\ + pix_base + (m1y) * i_fref + (m1x),\ + pix_base + (m2y) * i_fref + (m2x),\ + pix_base + (m3y) * i_fref + (m3x),\ + i_fref, costs);\ + costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\ + costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\ + costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\ + costs[3] += MV_COST_IPEL(omx + (m3x), omy + (m3y));\ + COPY3_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y));\ + COPY3_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y));\ + COPY3_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y));\ + COPY3_IF_LT(bcost, costs[3], bmx, omx + (m3x), bmy, omy + (m3y));\ + } else { \ + ME_COST_IPEL8(m0x, m0y); \ + ME_COST_IPEL8(m1x, m1y); \ + ME_COST_IPEL8(m2x, m2y); \ + ME_COST_IPEL8(m3x, m3y); \ + } \ +} + +#define ME_COST_IPEL10_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \ {\ if (CHECK_MV_RANGE_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y)) { \ - pel_t *pix_base = p_fref + omy * i_fref + omx;\ - g_funcs.pixf.sad_x4[i_pixel](p_org,\ + pel10_t *pix_base = p_fref + omy * i_fref + omx;\ + g_funcs.pixf.sad10_x4[i_pixel](p_org,\ pix_base + (m0y) * i_fref + (m0x),\ pix_base + (m1y) * i_fref + (m1x),\ pix_base + (m2y) * i_fref + (m2x),\ @@ -257,19 +339,37 @@ static const int i_org = FENC_STRIDE; COPY3_IF_LT(bcost, 
costs[2], bmx, omx + (m2x), bmy, omy + (m2y));\ COPY3_IF_LT(bcost, costs[3], bmx, omx + (m3x), bmy, omy + (m3y));\ } else { \ - ME_COST_IPEL(m0x, m0y); \ - ME_COST_IPEL(m1x, m1y); \ - ME_COST_IPEL(m2x, m2y); \ - ME_COST_IPEL(m3x, m3y); \ + ME_COST_IPEL10(m0x, m0y); \ + ME_COST_IPEL10(m1x, m1y); \ + ME_COST_IPEL10(m2x, m2y); \ + ME_COST_IPEL10(m3x, m3y); \ } \ } /* --------------------------------------------------------------------------- */ -#define ME_COST_IPEL_X4_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2, m3x, m3y, d3) \ +#define ME_COST_IPEL8_X4_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2, m3x, m3y, d3) \ +{\ + pel8_t *pix_base = p_fref + omy * i_fref + omx;\ + g_funcs.pixf.sad8_x4[i_pixel](p_org,\ + pix_base + (m0y) * i_fref + (m0x),\ + pix_base + (m1y) * i_fref + (m1x),\ + pix_base + (m2y) * i_fref + (m2x),\ + pix_base + (m3y) * i_fref + (m3x), i_fref, costs);\ + costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\ + costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\ + costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\ + costs[3] += MV_COST_IPEL(omx + (m3x), omy + (m3y));\ + COPY4_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y), dir, d0);\ + COPY4_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y), dir, d1);\ + COPY4_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y), dir, d2);\ + COPY4_IF_LT(bcost, costs[3], bmx, omx + (m3x), bmy, omy + (m3y), dir, d3);\ +} + +#define ME_COST_IPEL10_X4_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2, m3x, m3y, d3) \ {\ - pel_t *pix_base = p_fref + omy * i_fref + omx;\ - g_funcs.pixf.sad_x4[i_pixel](p_org,\ + pel10_t *pix_base = p_fref + omy * i_fref + omx;\ + g_funcs.pixf.sad10_x4[i_pixel](p_org,\ pix_base + (m0y) * i_fref + (m0x),\ pix_base + (m1y) * i_fref + (m1x),\ pix_base + (m2y) * i_fref + (m2x),\ @@ -286,18 +386,51 @@ static const int i_org = FENC_STRIDE; /* --------------------------------------------------------------------------- * for TZ */ -#define ME_COST_IPEL_DIR_DIST(mx, my, 
direction, dist) \ +#define ME_COST_IPEL8_DIR_DIST(mx, my, direction, dist) \ + if (CHECK_MV_RANGE(mx, my)) {\ + int cost = g_funcs.pixf.sad8[i_pixel](p_org, i_org,\ + p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\ + COPY5_IF_LT(mv->bcost, cost, mv->bmx, mx, mv->bmy, my, mv->bdir, direction, mv->bdist, dist);\ + } + +#define ME_COST_IPEL10_DIR_DIST(mx, my, direction, dist) \ if (CHECK_MV_RANGE(mx, my)) {\ - int cost = g_funcs.pixf.sad[i_pixel](p_org, i_org,\ + int cost = g_funcs.pixf.sad10[i_pixel](p_org, i_org,\ p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\ COPY5_IF_LT(mv->bcost, cost, mv->bmx, mx, mv->bmy, my, mv->bdir, direction, mv->bdist, dist);\ } /* --------------------------------------------------------------------------- * for TZ */ -#define ME_COST_IPEL_X4_DIR_DIST(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \ +#define ME_COST_IPEL8_X4_DIR_DIST(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \ +{\ + g_funcs.pixf.sad8_x4[i_pixel](p_org,\ + p_fref + (m0x) + (m0y) * i_fref,\ + p_fref + (m1x) + (m1y) * i_fref,\ + p_fref + (m2x) + (m2y) * i_fref,\ + p_fref + (m3x) + (m3y) * i_fref,\ + i_fref, costs);\ + (costs)[0] += MV_COST_IPEL(m0x, m0y);\ + (costs)[1] += MV_COST_IPEL(m1x, m1y);\ + (costs)[2] += MV_COST_IPEL(m2x, m2y);\ + (costs)[3] += MV_COST_IPEL(m3x, m3y);\ + if (CHECK_MV_RANGE(m0x,m0y)) {\ + COPY5_IF_LT(mv->bcost, costs[0], mv->bmx, m0x, mv->bmy, m0y, mv->bdir, p0, mv->bdist, d0);\ + }\ + if (CHECK_MV_RANGE(m1x,m1y)) {\ + COPY5_IF_LT(mv->bcost, costs[1], mv->bmx, m1x, mv->bmy, m1y, mv->bdir, p1, mv->bdist, d1);\ + }\ + if (CHECK_MV_RANGE(m2x,m2y)) {\ + COPY5_IF_LT(mv->bcost, costs[2], mv->bmx, m2x, mv->bmy, m2y, mv->bdir, p2, mv->bdist, d2);\ + }\ + if (CHECK_MV_RANGE(m3x,m3y)) {\ + COPY5_IF_LT(mv->bcost, costs[3], mv->bmx, m3x, mv->bmy, m3y, mv->bdir, p3, mv->bdist, d3);\ + }\ +} + +#define ME_COST_IPEL10_X4_DIR_DIST(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, 
d2, m3x, m3y, p3, d3) \ {\ - g_funcs.pixf.sad_x4[i_pixel](p_org,\ + g_funcs.pixf.sad10_x4[i_pixel](p_org,\ p_fref + (m0x) + (m0y) * i_fref,\ p_fref + (m1x) + (m1y) * i_fref,\ p_fref + (m2x) + (m2y) * i_fref,\ @@ -325,11 +458,18 @@ static const int i_org = FENC_STRIDE; * diamond: 1 * 1 0 1 * 1 */ -#define DIA_ITER(mx, my) \ +#define DIA_ITER8(mx, my) \ {\ omx = mx;\ omy = my;\ - ME_COST_IPEL_X4(0,-1, -1,0, 1,0, 0,1);\ + ME_COST_IPEL8_X4(0,-1, -1,0, 1,0, 0,1);\ +} + +#define DIA_ITER10(mx, my) \ +{\ + omx = mx;\ + omy = my;\ + ME_COST_IPEL10_X4(0,-1, -1,0, 1,0, 0,1);\ } @@ -341,16 +481,56 @@ static const int i_org = FENC_STRIDE; /* --------------------------------------------------------------------------- */ -#define ME_COST_QPEL(mx, my) \ +#define ME_COST_QPEL8(mx, my) \ +{\ + pel8_t *p_pred = p_filtered[(((my) & 3) << 2) + ((mx) & 3)] + i_offset\ + + ((my) >> 2) * i_fref + ((mx) >> 2); \ + cost = g_funcs.pixf.fpel8_cmp[i_pixel](p_org, i_org, p_pred, i_fref) + MV_COST_FPEL(mx, my);\ +} + +#define ME_COST_QPEL10(mx, my) \ {\ - pel_t *p_pred = p_filtered[(((my) & 3) << 2) + ((mx) & 3)] + i_offset\ + pel10_t *p_pred = p_filtered[(((my) & 3) << 2) + ((mx) & 3)] + i_offset\ + ((my) >> 2) * i_fref + ((mx) >> 2); \ - cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, i_fref) + MV_COST_FPEL(mx, my);\ + cost = g_funcs.pixf.fpel10_cmp[i_pixel](p_org, i_org, p_pred, i_fref) + MV_COST_FPEL(mx, my);\ } /* --------------------------------------------------------------------------- */ -#define ME_COST_QPEL_SYM \ +#define ME_COST_QPEL8_SYM \ +{\ + int mx_sym;\ + int my_sym;\ + \ + cost = MAX_DISTORTION;\ + if (h->i_type == SLICE_TYPE_B) {\ + mx_sym = -scale_mv_skip ( mx, distance_bwd, distance_fwd);\ + my_sym = -scale_mv_skip_y(h, my, distance_bwd, distance_fwd);\ + } else {\ + mx_sym = scale_mv_skip ( mx, distance_bwd, distance_fwd);\ + my_sym = scale_mv_skip_y(h, my, distance_bwd, distance_fwd);\ + }\ + \ + if (CHECK_MV_RANGE(mx, my) && CHECK_MV_RANGE(mx_sym, my_sym)) 
{\ + int xx1 = mx >> 2;\ + int yy1 = my >> 2;\ + int xx2 = mx_sym >> 2;\ + int yy2 = my_sym >> 2;\ + pel8_t *p_src1 = p_filtered1[((my & 3) << 2) + (mx & 3)]; \ + pel8_t *p_src2 = p_filtered2[((my_sym & 3) << 2) + (mx_sym & 3)]; \ + pel8_t *p_pred = buf_pixel_temp;\ + \ + if (p_src1 != NULL && p_src2 != NULL) { \ + p_src1 += i_offset + yy1 * i_fref + xx1;\ + p_src2 += i_offset + yy2 * i_fref + xx2;\ + g_funcs.pixf.avg8[i_pixel](p_pred, 64, p_src1, i_fref, p_src2, i_fref, 32); \ + cost = g_funcs.pixf.fpel8_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE)\ + + MV_COST_FPEL(mx, my);\ + } \ + }\ +} + +#define ME_COST_QPEL10_SYM \ {\ int mx_sym;\ int my_sym;\ @@ -369,15 +549,15 @@ static const int i_org = FENC_STRIDE; int yy1 = my >> 2;\ int xx2 = mx_sym >> 2;\ int yy2 = my_sym >> 2;\ - pel_t *p_src1 = p_filtered1[((my & 3) << 2) + (mx & 3)]; \ - pel_t *p_src2 = p_filtered2[((my_sym & 3) << 2) + (mx_sym & 3)]; \ - pel_t *p_pred = buf_pixel_temp;\ + pel10_t *p_src1 = p_filtered1[((my & 3) << 2) + (mx & 3)]; \ + pel10_t *p_src2 = p_filtered2[((my_sym & 3) << 2) + (mx_sym & 3)]; \ + pel10_t *p_pred = buf_pixel_temp;\ \ if (p_src1 != NULL && p_src2 != NULL) { \ p_src1 += i_offset + yy1 * i_fref + xx1;\ p_src2 += i_offset + yy2 * i_fref + xx2;\ - g_funcs.pixf.avg[i_pixel](p_pred, 64, p_src1, i_fref, p_src2, i_fref, 32); \ - cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE)\ + g_funcs.pixf.avg10[i_pixel](p_pred, 64, p_src1, i_fref, p_src2, i_fref, 32); \ + cost = g_funcs.pixf.fpel10_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE)\ + MV_COST_FPEL(mx, my);\ } \ }\ @@ -385,12 +565,24 @@ static const int i_org = FENC_STRIDE; /* --------------------------------------------------------------------------- */ -#define ME_COST_QPEL_BID \ +#define ME_COST_QPEL8_BID \ + if (CHECK_MV_RANGE(mx, my) && CHECK_MV_RANGE(mx_bid, my_bid)) {\ + int xx1 = mx >> 2;\ + int yy1 = my >> 2;\ + pel8_t *p_src1 = p_filtered1[((my & 3) << 2) + (mx & 3)] + i_offset + yy1 * i_fref + 
xx1;\ + int distortion = g_funcs.pixf.fpel8_cmp[i_pixel](buf_pixel_temp, MAX_CU_SIZE, p_src1, i_fref) >> 1;\ + \ + cost = distortion + MV_COST_FPEL(mx, my) + mv_bid_bit;\ + } else {\ + cost = MAX_DISTORTION;\ + } + +#define ME_COST_QPEL10_BID \ if (CHECK_MV_RANGE(mx, my) && CHECK_MV_RANGE(mx_bid, my_bid)) {\ int xx1 = mx >> 2;\ int yy1 = my >> 2;\ - pel_t *p_src1 = p_filtered1[((my & 3) << 2) + (mx & 3)] + i_offset + yy1 * i_fref + xx1;\ - int distortion = g_funcs.pixf.fpel_cmp[i_pixel](buf_pixel_temp, MAX_CU_SIZE, p_src1, i_fref) >> 1;\ + pel10_t *p_src1 = p_filtered1[((my & 3) << 2) + (mx & 3)] + i_offset + yy1 * i_fref + xx1;\ + int distortion = g_funcs.pixf.fpel10_cmp[i_pixel](buf_pixel_temp, MAX_CU_SIZE, p_src1, i_fref) >> 1;\ \ cost = distortion + MV_COST_FPEL(mx, my) + mv_bid_bit;\ } else {\ @@ -405,9 +597,9 @@ static const int i_org = FENC_STRIDE; */ /* --------------------------------------------------------------------------- - * 若candMV超过1/4精度的阈值,则新的MV应采用2倍步长,若此时新的MV在阈值范围内,则返回1,表示新的MV不应继续搜索 - * 若candMV在1/4精度阈值范围内,则新的MV采用单倍步长,此时若新MV超过阈值范围,则返回1,表示新的MV不应继续搜索 - * 否则,返回0值表示新的MV应该继续被搜索 + * 鑻andMV瓒呰繃1/4绮惧害鐨勯槇鍊硷紝鍒欐柊鐨凪V搴旈噰鐢2鍊嶆闀匡紝鑻ユ鏃舵柊鐨凪V鍦ㄩ槇鍊艰寖鍥村唴锛屽垯杩斿洖1锛岃〃绀烘柊鐨凪V涓嶅簲缁х画鎼滅储 + * 鑻andMV鍦1/4绮惧害闃堝艰寖鍥村唴锛屽垯鏂扮殑MV閲囩敤鍗曞嶆闀匡紝姝ゆ椂鑻ユ柊MV瓒呰繃闃堝艰寖鍥达紝鍒欒繑鍥1锛岃〃绀烘柊鐨凪V涓嶅簲缁х画鎼滅储 + * 鍚﹀垯锛岃繑鍥0鍊艰〃绀烘柊鐨凪V搴旇缁х画琚悳绱 */ static int pmvr_adapt_mv(int *mx, int *my, int ctr_x, int ctr_y, int mv_x, int mv_y, int step_x, int step_y) @@ -479,11 +671,6 @@ mv_clip(int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_min[2], int mv_m static dist_t me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me) { -#if !ENABLE_FRAME_SUBPEL_INTPL - ALIGN32(pel_t p_pred[MAX_CU_SIZE * MAX_CU_SIZE]); -#endif - pel_t *p_org = p_me->p_fenc; - pel_t **p_filtered = p_me->p_fref_1st->filtered; int i_fref = p_me->p_fref_1st->i_stride[IMG_Y]; int pmx = p_me->mvp.x; int pmy = p_me->mvp.y; @@ -508,8 +695,88 @@ me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me) bmy = p_me->bmv.y; bmv = p_me->bmv; + if (h->param->input_sample_bit_depth 
== 8) { +#if !ENABLE_FRAME_SUBPEL_INTPL + ALIGN32(pel8_t p_pred[MAX_CU_SIZE * MAX_CU_SIZE]); +#endif + pel8_t *p_org = p_me->p_fenc8; + pel8_t **p_filtered = p_me->p_fref_1st->filtered8; + if (h->param->enable_hadamard) { + ME_COST_QPEL8(bmx, bmy); + bcost = cost; + } else { + bcost = p_me->bcost; + } + + /* ------------------------------------------------------------- + * half-pel refine */ + + // loop over search positions + for (pos = 1; pos < search_pos2; pos += search_step) { + mx = bmx + (search_pattern[pos][0] << 1); + my = bmy + (search_pattern[pos][1] << 1); +#if ENABLE_FRAME_SUBPEL_INTPL + ME_COST_QPEL8(mx, my); +#else + mv_t mvt; + mvt.v = MAKEDWORD(mx, my); + get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, p_me->i_block_w, p_me->i_block_h); + mc_luma(p_pred, MAX_CU_SIZE, mvt.x, mvt.y, p_me->i_block_w, p_me->i_block_h, p_me->p_fref_1st); + cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE) + MV_COST_FPEL(mx, my); +#endif + if (cost < bcost) { + bcost = cost; + bmv.v = MAKEDWORD(mx, my); + } + } + + bmx = bmv.x; + bmy = bmv.y; + + /* ------------------------------------------------------------- + * quarter-pel refine */ + + if (h->use_fractional_me > 1) { + // loop over search positions + for (pos = 1; pos < search_pos4; pos += search_step) { + if (h->param->enable_pmvr) { + if (pmvr_adapt_mv(&mx, &my, ctr_x, ctr_y, bmx, bmy, search_pattern[pos][0], search_pattern[pos][1])) { + continue; + } + } else { + mx = bmx + search_pattern[pos][0]; // quarter-pel units + my = bmy + search_pattern[pos][1]; // quarter-pel units + } + + // set motion vector cost +#if ENABLE_FRAME_SUBPEL_INTPL + ME_COST_QPEL8(mx, my); +#else + mv_t mvt; + mvt.v = MAKEDWORD(mx, my); + get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, p_me->i_block_w, p_me->i_block_h); + mc_luma(p_pred, MAX_CU_SIZE, mvt.x, mvt.y, p_me->i_block_w, p_me->i_block_h, p_me->p_fref_1st); + cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE) + MV_COST_FPEL(mx, 
my); +#endif + if (cost < bcost) { + bcost = cost; + bmv.v = MAKEDWORD(mx, my); + } + } + } + // save the results + p_me->bmv = bmv; + p_me->bcost = bcost; + p_me->mvcost[PDIR_FWD] = MV_COST_FPEL(bmv.x,bmv.y); + return bcost; + } else { +#if !ENABLE_FRAME_SUBPEL_INTPL + ALIGN32(pel10_t p_pred[MAX_CU_SIZE * MAX_CU_SIZE]); +#endif + pel10_t *p_org = p_me->p_fenc10; + pel10_t **p_filtered = p_me->p_fref_1st->filtered10; if (h->param->enable_hadamard) { - ME_COST_QPEL(bmx, bmy); + ME_COST_QPEL10(bmx, bmy); bcost = cost; } else { bcost = p_me->bcost; @@ -523,7 +790,7 @@ me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me) mx = bmx + (search_pattern[pos][0] << 1); my = bmy + (search_pattern[pos][1] << 1); #if ENABLE_FRAME_SUBPEL_INTPL - ME_COST_QPEL(mx, my); + ME_COST_QPEL10(mx, my); #else mv_t mvt; mvt.v = MAKEDWORD(mx, my); @@ -557,7 +824,7 @@ me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me) // set motion vector cost #if ENABLE_FRAME_SUBPEL_INTPL - ME_COST_QPEL(mx, my); + ME_COST_QPEL10(mx, my); #else mv_t mvt; mvt.v = MAKEDWORD(mx, my); @@ -576,6 +843,7 @@ me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me) p_me->bcost = bcost; p_me->mvcost[PDIR_FWD] = MV_COST_FPEL(bmv.x,bmv.y); return bcost; + } } @@ -666,10 +934,10 @@ void xavs2_me_init_umh_threshold(xavs2_t *h, double *bsize, int i_qp) /* --------------------------------------------------------------------------- */ -static void tz_pattern_search(xavs2_t* h, +static void tz_pattern_search8(xavs2_t* h, xavs2_me_t *p_me, - pel_t* p_org, - pel_t* p_fref, + pel8_t* p_org, + pel8_t* p_fref, mv_info* mv, int mv_x_min, int mv_y_min, @@ -700,22 +968,22 @@ static void tz_pattern_search(xavs2_t* h, int idx; if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) { - ME_COST_IPEL_X4_DIR_DIST(omx, top, 2, dist, /* direction */ + ME_COST_IPEL8_X4_DIR_DIST(omx, top, 2, dist, /* direction */ left, omy, 4, dist, /* 2 */ right, omy, 5, dist, /* 4 * 5 */ omx, bottom, 7, dist); /* 7 */ } else { if (top >= mv_y_min) { 
// check top - ME_COST_IPEL_DIR_DIST(omx, top, 2, dist); + ME_COST_IPEL8_DIR_DIST(omx, top, 2, dist); } if (left >= mv_x_min) { // check middle left - ME_COST_IPEL_DIR_DIST(left, omy, 4, dist); + ME_COST_IPEL8_DIR_DIST(left, omy, 4, dist); } if (right <= mv_x_max) { // check middle right - ME_COST_IPEL_DIR_DIST(right, omy, 5, dist); + ME_COST_IPEL8_DIR_DIST(right, omy, 5, dist); } if (bottom <= mv_y_max) { // check bottom - ME_COST_IPEL_DIR_DIST(omx, bottom, 7, dist); + ME_COST_IPEL8_DIR_DIST(omx, bottom, 7, dist); } } @@ -745,42 +1013,42 @@ static void tz_pattern_search(xavs2_t* h, // check border if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) { - ME_COST_IPEL_X4_DIR_DIST(omx, top, 2, dist, + ME_COST_IPEL8_X4_DIR_DIST(omx, top, 2, dist, left2, top2, 1, dist >> 1, right2, top2, 3, dist >> 1, left, omy, 4, dist); - ME_COST_IPEL_X4_DIR_DIST(right, omy, 5, dist, + ME_COST_IPEL8_X4_DIR_DIST(right, omy, 5, dist, left2, bottom2, 6, dist >> 1, right2, bottom2, 8, dist >> 1, omx, bottom, 7, dist); } else { if (top >= mv_y_min) { // check top - ME_COST_IPEL_DIR_DIST(omx, top, 2, dist); + ME_COST_IPEL8_DIR_DIST(omx, top, 2, dist); } if (top2 >= mv_y_min) { // check half top if (left2 >= mv_x_min) { // check half left - ME_COST_IPEL_DIR_DIST(left2, top2, 1, (dist >> 1)); + ME_COST_IPEL8_DIR_DIST(left2, top2, 1, (dist >> 1)); } if (right2 <= mv_x_max) { // check half right - ME_COST_IPEL_DIR_DIST(right2, top2, 3, (dist >> 1)); + ME_COST_IPEL8_DIR_DIST(right2, top2, 3, (dist >> 1)); } } if (left >= mv_x_min) { // check left - ME_COST_IPEL_DIR_DIST(left, omy, 4, dist); + ME_COST_IPEL8_DIR_DIST(left, omy, 4, dist); } if (right <= mv_x_max) { // check right - ME_COST_IPEL_DIR_DIST(right, omy, 5, dist); + ME_COST_IPEL8_DIR_DIST(right, omy, 5, dist); } if (bottom2 <= mv_y_max) { // check half bottom if (left2 >= mv_x_min) { // check half left - ME_COST_IPEL_DIR_DIST(left2, bottom2, 6, (dist >> 1)); + ME_COST_IPEL8_DIR_DIST(left2, bottom2, 6, 
(dist >> 1)); } if (right2 <= mv_x_max) { // check half right - ME_COST_IPEL_DIR_DIST(right2, bottom2, 8, (dist >> 1)); + ME_COST_IPEL8_DIR_DIST(right2, bottom2, 8, (dist >> 1)); } } if (bottom <= mv_y_max) { // check bottom - ME_COST_IPEL_DIR_DIST(omx, bottom, 7, dist); + ME_COST_IPEL8_DIR_DIST(omx, bottom, 7, dist); } } if (mv->bcost < bcost) { @@ -809,7 +1077,7 @@ static void tz_pattern_search(xavs2_t* h, * 2 * 3 * 0 */ - ME_COST_IPEL_X4_DIR_DIST(omx, top, 0, dist, + ME_COST_IPEL8_X4_DIR_DIST(omx, top, 0, dist, left, omy, 0, dist, right, omy, 0, dist, omx, bottom, 0, dist); @@ -818,7 +1086,7 @@ static void tz_pattern_search(xavs2_t* h, posYB = bottom - ((dist >> 2) * idx); posXL = omx - ((dist >> 2) * idx); posXR = omx + ((dist >> 2) * idx); - ME_COST_IPEL_X4_DIR_DIST(posXL, posYT, 0, dist, + ME_COST_IPEL8_X4_DIR_DIST(posXL, posYT, 0, dist, posXR, posYT, 0, dist, posXL, posYB, 0, dist, posXR, posYB, 0, dist); @@ -826,16 +1094,16 @@ static void tz_pattern_search(xavs2_t* h, } else { // check border for each mv if (top >= mv_y_min) { // check top - ME_COST_IPEL_DIR_DIST(omx, top, 0, dist); + ME_COST_IPEL8_DIR_DIST(omx, top, 0, dist); } if (left >= mv_x_min) { // check left - ME_COST_IPEL_DIR_DIST(left, omy, 0, dist); + ME_COST_IPEL8_DIR_DIST(left, omy, 0, dist); } if (right <= mv_x_max) { // check right - ME_COST_IPEL_DIR_DIST(right, omy, 0, dist); + ME_COST_IPEL8_DIR_DIST(right, omy, 0, dist); } if (bottom <= mv_y_max) { // check bottom - ME_COST_IPEL_DIR_DIST(omx, bottom, 0, dist); + ME_COST_IPEL8_DIR_DIST(omx, bottom, 0, dist); } for (idx = 1; idx < 4; idx++) { @@ -846,18 +1114,18 @@ static void tz_pattern_search(xavs2_t* h, if (posYT >= mv_y_min) { // check top if (posXL >= mv_x_min) { // check left - ME_COST_IPEL_DIR_DIST(posXL, posYT, 0, dist); + ME_COST_IPEL8_DIR_DIST(posXL, posYT, 0, dist); } if (posXR <= mv_x_max) { // check right - ME_COST_IPEL_DIR_DIST(posXR, posYT, 0, dist); + ME_COST_IPEL8_DIR_DIST(posXR, posYT, 0, dist); } } if (posYB <= mv_y_max) { 
// check bottom if (posXL >= mv_x_min) { // check left - ME_COST_IPEL_DIR_DIST(posXL, posYB, 0, dist); + ME_COST_IPEL8_DIR_DIST(posXL, posYB, 0, dist); } if (posXR <= mv_x_max) { // check right - ME_COST_IPEL_DIR_DIST(posXR, posYB, 0, dist); + ME_COST_IPEL8_DIR_DIST(posXR, posYB, 0, dist); } } } @@ -871,57 +1139,538 @@ static void tz_pattern_search(xavs2_t* h, } } -// int g_me_time[4] = { 0 }; - -/* --------------------------------------------------------------------------- - * return minimum motion cost after search - */ -dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc) +static void tz_pattern_search10(xavs2_t* h, + xavs2_me_t *p_me, + pel10_t* p_org, + pel10_t* p_fref, + mv_info* mv, + int mv_x_min, + int mv_y_min, + int mv_x_max, + int mv_y_max, + int i_pixel, + int i_fref, + int earlyExitIters, + int merange) { - /* special version of pack to allow shortcuts in CHECK_MV_RANGE */ - ALIGNED_ARRAY_16(int, costs,[8]); - double beta2 = p_me->beta2 + 1; - double beta3 = p_me->beta3 + 1; - pel_t *p_org = p_me->p_fenc; - pel_t *p_fref = p_me->p_fref_1st->planes[IMG_Y] + p_me->i_bias; - int i_fref = p_me->p_fref_1st->i_stride[IMG_Y]; - int i_pixel = p_me->i_pixel; - int mv_x_min = p_me->mv_min_fpel[0]; - int mv_y_min = p_me->mv_min_fpel[1]; - int mv_x_max = p_me->mv_max_fpel[0]; - int mv_y_max = p_me->mv_max_fpel[1]; - int me_range = h->param->search_range; - int lambda = h->i_lambda_factor; // factor for determining Lagrangian's motion cost + ALIGN16(int costs[16]); const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min); const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000; const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp.x; const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp.y; - uint32_t pmv; - dist_t bcost = MAX_DISTORTION; - int bmx = 0, bmy = 0; - int omx, omy; - int i, j, dir, idx; + int lambda = h->i_lambda_factor; + int rounds = 0; + int dist = 1; + int omx = mv->bmx; + int omy = mv->bmy; + dist_t bcost
= mv->bcost; + int top = omy - dist; + int bottom = omy + dist; + int left = omx - dist; + int right = omx + dist; + int top2, bottom2, left2, right2; + int posYT, posYB, posXL, posXR; + int idx; + + if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) { + ME_COST_IPEL10_X4_DIR_DIST(omx, top, 2, dist, /* direction */ + left, omy, 4, dist, /* 2 */ + right, omy, 5, dist, /* 4 * 5 */ + omx, bottom, 7, dist); /* 7 */ + } else { + if (top >= mv_y_min) { // check top + ME_COST_IPEL10_DIR_DIST(omx, top, 2, dist); + } + if (left >= mv_x_min) { // check middle left + ME_COST_IPEL10_DIR_DIST(left, omy, 4, dist); + } + if (right <= mv_x_max) { // check middle right + ME_COST_IPEL10_DIR_DIST(right, omy, 5, dist); + } + if (bottom <= mv_y_max) { // check bottom + ME_COST_IPEL10_DIR_DIST(omx, bottom, 7, dist); + } + } + + if (mv->bcost < bcost) { + rounds = 0; + } else if (++rounds >= earlyExitIters) { + return; + } + + for (dist = 2; dist <= 8; dist <<= 1) { + /* 2 points 2, 4, 5, 7 are dist + * 1 3 points 1, 3, 6, 8 are dist/2 + * 4 * 5 + * 6 8 + * 7 */ + omx = mv->bmx; + omy = mv->bmy; + bcost = mv->bcost; + top = omy - dist; + bottom = omy + dist; + left = omx - dist; + right = omx + dist; + top2 = omy - (dist >> 1); + bottom2 = omy + (dist >> 1); + left2 = omx - (dist >> 1); + right2 = omx + (dist >> 1); + + // check border + if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) { + ME_COST_IPEL10_X4_DIR_DIST(omx, top, 2, dist, + left2, top2, 1, dist >> 1, + right2, top2, 3, dist >> 1, + left, omy, 4, dist); + ME_COST_IPEL10_X4_DIR_DIST(right, omy, 5, dist, + left2, bottom2, 6, dist >> 1, + right2, bottom2, 8, dist >> 1, + omx, bottom, 7, dist); + } else { + if (top >= mv_y_min) { // check top + ME_COST_IPEL10_DIR_DIST(omx, top, 2, dist); + } + if (top2 >= mv_y_min) { // check half top + if (left2 >= mv_x_min) { // check half left + ME_COST_IPEL10_DIR_DIST(left2, top2, 1, (dist >> 1)); + } + if (right2 <=
mv_x_max) { // check half right + ME_COST_IPEL10_DIR_DIST(right2, top2, 3, (dist >> 1)); + } + } + if (left >= mv_x_min) { // check left + ME_COST_IPEL10_DIR_DIST(left, omy, 4, dist); + } + if (right <= mv_x_max) { // check right + ME_COST_IPEL10_DIR_DIST(right, omy, 5, dist); + } + if (bottom2 <= mv_y_max) { // check half bottom + if (left2 >= mv_x_min) { // check half left + ME_COST_IPEL10_DIR_DIST(left2, bottom2, 6, (dist >> 1)); + } + if (right2 <= mv_x_max) { // check half right + ME_COST_IPEL10_DIR_DIST(right2, bottom2, 8, (dist >> 1)); + } + } + if (bottom <= mv_y_max) { // check bottom + ME_COST_IPEL10_DIR_DIST(omx, bottom, 7, dist); + } + } + if (mv->bcost < bcost) { + rounds = 0; + } else if (++rounds >= earlyExitIters) { + return; + } + } + + for (dist = 16; dist <= merange; dist <<= 1) { + omx = mv->bmx; + omy = mv->bmy; + bcost = mv->bcost; + top = omy - dist; + bottom = omy + dist; + left = omx - dist; + right = omx + dist; + + if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) { // check border + /* index: 0 + * 3 + * 2 + * 1 + * 0 3 2 1 * 1 2 3 0 + * 1 + * 2 + * 3 + * 0 */ + ME_COST_IPEL10_X4_DIR_DIST(omx, top, 0, dist, + left, omy, 0, dist, + right, omy, 0, dist, + omx, bottom, 0, dist); + for (idx = 1; idx < 4; idx++) { + posYT = top + ((dist >> 2) * idx); + posYB = bottom - ((dist >> 2) * idx); + posXL = omx - ((dist >> 2) * idx); + posXR = omx + ((dist >> 2) * idx); + ME_COST_IPEL10_X4_DIR_DIST(posXL, posYT, 0, dist, + posXR, posYT, 0, dist, + posXL, posYB, 0, dist, + posXR, posYB, 0, dist); + } + } else { + // check border for each mv + if (top >= mv_y_min) { // check top + ME_COST_IPEL10_DIR_DIST(omx, top, 0, dist); + } + if (left >= mv_x_min) { // check left + ME_COST_IPEL10_DIR_DIST(left, omy, 0, dist); + } + if (right <= mv_x_max) { // check right + ME_COST_IPEL10_DIR_DIST(right, omy, 0, dist); + } + if (bottom <= mv_y_max) { // check bottom + ME_COST_IPEL10_DIR_DIST(omx, bottom, 0, dist); + } + + for (idx =
1; idx < 4; idx++) { + posYT = top + ((dist >> 2) * idx); + posYB = bottom - ((dist >> 2) * idx); + posXL = omx - ((dist >> 2) * idx); + posXR = omx + ((dist >> 2) * idx); + + if (posYT >= mv_y_min) { // check top + if (posXL >= mv_x_min) { // check left + ME_COST_IPEL10_DIR_DIST(posXL, posYT, 0, dist); + } + if (posXR <= mv_x_max) { // check right + ME_COST_IPEL10_DIR_DIST(posXR, posYT, 0, dist); + } + } + if (posYB <= mv_y_max) { // check bottom + if (posXL >= mv_x_min) { // check left + ME_COST_IPEL10_DIR_DIST(posXL, posYB, 0, dist); + } + if (posXR <= mv_x_max) { // check right + ME_COST_IPEL10_DIR_DIST(posXR, posYB, 0, dist); + } + } + } + } + + if (mv->bcost < bcost) { + rounds = 0; + } else if (++rounds >= earlyExitIters) { + return; + } + } +} + +// int g_me_time[4] = { 0 }; + +/* --------------------------------------------------------------------------- + * return minimum motion cost after search + */ +dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc) +{ + /* special version of pack to allow shortcuts in CHECK_MV_RANGE */ + ALIGNED_ARRAY_16(int, costs,[8]); + double beta2 = p_me->beta2 + 1; + double beta3 = p_me->beta3 + 1; + int i_fref = p_me->p_fref_1st->i_stride[IMG_Y]; + int i_pixel = p_me->i_pixel; + int mv_x_min = p_me->mv_min_fpel[0]; + int mv_y_min = p_me->mv_min_fpel[1]; + int mv_x_max = p_me->mv_max_fpel[0]; + int mv_y_max = p_me->mv_max_fpel[1]; + int me_range = h->param->search_range; + int lambda = h->i_lambda_factor; // factor for determining Lagrangian's motion cost + const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min); + const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000; + const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp.x; + const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp.y; + uint32_t pmv; + dist_t bcost = MAX_DISTORTION; + int bmx = 0, bmy = 0; + int omx, omy; + int i, j, dir, idx; + + const int umh_1_3_step = h->UMH_big_hex_level == 2 ? 
16 : 8; + const int8_t(*search_patern)[2] = h->UMH_big_hex_level == 2 ? HEX4 : FAST_HEX4; + + // g_me_time[0]++; + /* ------------------------------------------------------------- + * try MVP and some key searching points */ + pmv = MAKEDWORD(mvc[0][0], mvc[0][1]); /* mvc[0][] is the MVP */ + + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_org = p_me->p_fenc8; + pel8_t *p_fref = p_me->p_fref_1st->planes8[IMG_Y] + p_me->i_bias; + for (i = 0; i < i_mvc; i++) { + int mx = mvc[i][0]; + int my = mvc[i][1]; + ME_COST_IPEL8(mx, my); + } + + if (bcost == MAX_DISTORTION) { + goto _me_error8; /* me failed */ + } + + /* ------------------------------------------------------------- + * search using different method */ + + switch (h->param->me_method) { + case XAVS2_ME_TZ: { /* TZ */ + const int RasterDistance = 16; + const int MaxIters = 32; + const int EarlyExitIters = 3; + dist_t bdist; + int mv1_x, mv1_y, mv2_x, mv2_y; + mv_info mvinfo; + + omx = bmx; + omy = bmy; + ME_COST_IPEL8_X3(-2, 0, -1, 2, 1, 2); + ME_COST_IPEL8_X3( 2, 0, 1, -2, -1, -2); + + if (CHECK_MV_RANGE(bmx, bmy)) { + DIA_ITER8(bmx, bmy); + } + + mvinfo.bcost = bcost; + mvinfo.bdist = 0; + mvinfo.bmx = bmx; + mvinfo.bmy = bmy; + mvinfo.bdir = 0; + tz_pattern_search8(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, EarlyExitIters, me_range); + bcost = mvinfo.bcost; + bdist = mvinfo.bdist; + bmx = mvinfo.bmx; + bmy = mvinfo.bmy; + dir = mvinfo.bdir; + + if (bdist == 1) { + if (!dir) { + break; + } + + /* if best distance was only 1, check two missing points. 
+ * for a given direction 1 to 8, check nearest two outer X pixels*/ + mv1_x = bmx + offsets[(dir - 1) * 2 ][0]; /* X X */ + mv1_y = bmy + offsets[(dir - 1) * 2 ][1]; /* X 1 2 3 X */ + mv2_x = bmx + offsets[(dir - 1) * 2 + 1][0]; /* 4 * 5 */ + mv2_y = bmy + offsets[(dir - 1) * 2 + 1][1]; /* X 6 7 8 X */ + if (CHECK_MV_RANGE(mv1_x, mv1_y)) { /* X X */ + ME_COST_IPEL8(mv1_x, mv1_y); + } + if (CHECK_MV_RANGE(mv2_x, mv2_y)) { + ME_COST_IPEL8(mv2_x, mv2_y); + } + + /* if no new point is found, stop */ + if (bcost == mvinfo.bcost) { + break; /* the bcost is not changed */ + } + } + + /* raster search refinement if original search distance was too big */ + if (bdist > RasterDistance) { + const int iRasterDist = RasterDistance >> 1; + const int iRasterDist2 = RasterDistance >> 2; + int rmv_y_min = XAVS2_MAX(mv_y_min, bmy - RasterDistance + 2); + int rmv_y_max = XAVS2_MIN(mv_y_max, bmy + RasterDistance - 2); + int rmv_x_min = XAVS2_MAX(mv_x_min, bmx - RasterDistance + 2); + int rmv_x_max = XAVS2_MIN(mv_x_max, bmx + RasterDistance - 2); + for (j = rmv_y_min; j < rmv_y_max; j += iRasterDist) { + for (i = rmv_x_min; i < rmv_x_max; i += iRasterDist) { + ME_COST_IPEL8_X4(i, j, i, j + iRasterDist2, i + iRasterDist2, j, i + iRasterDist2, j + iRasterDist2); + } + } + } + + while (bdist > 0) { + // center a new search around current best + mvinfo.bcost = bcost; + mvinfo.bdist = 0; + mvinfo.bmx = bmx; + mvinfo.bmy = bmy; + mvinfo.bdir = 0; + tz_pattern_search8(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, MaxIters, me_range); + bcost = mvinfo.bcost; + bdist = mvinfo.bdist; + bmx = mvinfo.bmx; + bmy = mvinfo.bmy; + dir = mvinfo.bdir; + + if (bdist == 1) { + /* for a given direction 1 to 8, check nearest 2 outer X pixels */ + if (dir) { /* X X */ + mv1_x = bmx + offsets[(dir - 1) * 2 ][0]; /* X 1 2 3 X */ + mv1_y = bmy + offsets[(dir - 1) * 2 ][1]; /* 4 * 5 */ + mv2_x = bmx + offsets[(dir - 1) * 2 + 1][0]; /* X 6 7 8 X */ + mv2_y = bmy + 
offsets[(dir - 1) * 2 + 1][1]; /* X X */ + if (CHECK_MV_RANGE(mv1_x, mv1_y)) { + ME_COST_IPEL8(mv1_x, mv1_y); + } + if (CHECK_MV_RANGE(mv2_x, mv2_y)) { + ME_COST_IPEL8(mv2_x, mv2_y); + } + } + break; + } + } + + /* equivalent to the above, but eliminates duplicate candidates */ + goto umh_step8_2; + } + case XAVS2_ME_UMH: /* UMH */ + /* http://www.cnblogs.com/TaigaCon/archive/2014/06/16/3788984.html + * 0. 鍒濆鐐规悳绱 */ + DIA_ITER8(mvc[0][0], mvc[0][1]); + if (pmv && (bmx != mvc[0][0] || bmy != mvc[0][1])) { + DIA_ITER8(bmx, bmy); + pmv = MAKEDWORD(bmx, bmy); + } + + // select different step according to the different cost from upper layer + if (p_me->mvp1.v != 0) { + int mx = IPEL(p_me->mvp1.x); + int my = IPEL(p_me->mvp1.y); + ME_COST_IPEL8(mx, my); + } + EARLY_TERMINATION8(p_me->pred_sad_uplayer); + // g_me_time[1]++; + + // prediction using mv of last ref_idx motion vector + if (p_me->i_ref_idx > 0) { + ME_COST_IPEL8(IPEL(p_me->mvp2.x), IPEL(p_me->mvp2.y)); + } + if (p_me->mvp3.v != 0) { + ME_COST_IPEL8(IPEL(p_me->mvp3.x), IPEL(p_me->mvp3.y)); + } + + /* 褰撳墠鏈浼楳V涓嶆槸 MVP锛屾悳绱㈠叾鍛ㄥ洿涓涓皬绐楀彛 */ + if (pmv != (uint32_t)MAKEDWORD(bmx, bmy)) { + DIA_ITER8(bmx, bmy); + } + + // early termination algorithm + EARLY_TERMINATION8(p_me->pred_sad); + + // umh_step_1: + /* UMH 1. Unsymmetrical-cross search 锛堥潪瀵圭О鍗佸瓧鎼滅储锛 */ + // g_me_time[2]++; + omx = bmx; + omy = bmy; + for (i = 1; i <= me_range; i += 2) { + ME_COST_IPEL8(omx + i, omy); + ME_COST_IPEL8(omx - i, omy); + } + for (j = 1; j <= me_range / 2; j += 2) { + ME_COST_IPEL8(omx, omy + j); + ME_COST_IPEL8(omx, omy - j); + } + + // early termination algorithm + EARLY_TERMINATION8(p_me->pred_sad); + + /* UMH 2. 
Spiral search 锛堣灪鏃嬫悳绱級 */ + omx = bmx; + omy = bmy; + for (i = 0; i < 24; i++) { + ME_COST_IPEL8(omx + GRID[i][0], omy + GRID[i][1]); + } + + // early termination algorithm + EARLY_TERMINATION8(p_me->pred_sad); + + // big hexagon + if (h->UMH_big_hex_level) { + for (j = 1; j <= me_range / 4; j++) { + omx = bmx; + omy = bmy; + for (i = 0; i < umh_1_3_step; i++) { + ME_COST_IPEL8(omx + search_patern[i][0] * j, omy + search_patern[i][1] * j); + } + if (bmx != omx || bmy != omy) { + EARLY_TERMINATION8(p_me->pred_sad); + } + } + } + /* !!! NO break statement here */ + case XAVS2_ME_HEX: /* hexagon search */ +umh_step8_2 : /* UMH 3. Uneven Multi-Hexagon-grid Search 锛堜笉瑙勫緥鍏竟褰㈡ā鏉挎悳绱級 */ + // g_me_time[3]++; + dir = 0; /* 6 5 */ + omx = bmx; /* */ + omy = bmy; /* 1 * 4 */ + ME_COST_IPEL8_X3_DIR(-1,-2,6, 1,-2,5, -2,0,1); /* */ + ME_COST_IPEL8_X3_DIR( 2, 0,4, -1, 2,2, 1,2,3); /* 2 3 */ - const int umh_1_3_step = h->UMH_big_hex_level == 2 ? 16 : 8; - const int8_t(*search_patern)[2] = h->UMH_big_hex_level == 2 ? HEX4 : FAST_HEX4; + if (dir) { + const int8_t (*hex)[2]; + /* UMH 4. Extended Hexagon-based Search 锛堝叚杈瑰舰妯℃澘鍙嶅鎼滅储锛 */ + idx = dir - 1; /* start array index */ + /* half hexagon, not overlapping the previous iteration */ + for (i = 0; i < me_range && CHECK_MV_RANGE(bmx, bmy); i++) { + dir = 0; + omx = bmx; + omy = bmy; + hex = &HEX2[idx]; + ME_COST_IPEL8_X3_DIR(hex[0][0],hex[0][1],1, hex[1][0],hex[1][1],2, hex[2][0],hex[2][1],3); + if (!dir) { + break; /* early terminate */ + } + idx = M1MOD6[dir + idx - 1]; /* next start array index */ + } + } + /* !!! NO break statement here */ + case XAVS2_ME_DIA: /* diamond search */ +umh_step8_3: /* UMH 5. 
the third step with a small search pattern 锛堝皬鑿卞舰妯℃澘鍙嶅鎼滅储锛 */ + dir = 0; + if (CHECK_MV_RANGE(bmx, bmy)) { + omx = bmx; /* 4 */ + omy = bmy; /* 1 * 3 */ + ME_COST_IPEL8_X4_DIR(0,-1,4, -1,0,1, 1,0,3, 0,1,2); /* 2 */ + } + if (dir) { + const int8_t (*dia)[2]; + idx = dir - 1; /* start array index */ + /* half diamond, not overlapping the previous iteration */ + for (i = 0; i < me_range && CHECK_MV_RANGE(bmx, bmy); i++) { + dir = 0; + omx = bmx; + omy = bmy; + dia = &DIA1[idx]; + ME_COST_IPEL8_X3_DIR(dia[0][0],dia[0][1],1, dia[1][0],dia[1][1],2, dia[2][0],dia[2][1],3); + if (!dir) { + break; /* early terminate */ + } + idx = M1MOD4[dir + idx - 1]; /* next start array index */ + } + } + break; + default: /* XAVS2_ME_FS: full search */ + omx = bmx; + omy = bmy; + for (j = -me_range; j < me_range; j++) { + for (i = -me_range; i < me_range; i++) { + ME_COST_IPEL8(omx + i, omy + j); + } + } + break; + } - // g_me_time[0]++; /* ------------------------------------------------------------- - * try MVP and some key searching points */ - pmv = MAKEDWORD(mvc[0][0], mvc[0][1]); /* mvc[0][] is the MVP */ + * store the results of fullpel search */ + p_me->bmv.v = MAKEDWORD(FPEL(bmx), FPEL(bmy)); + p_me->bmv2.v = MAKEDWORD(bmx, bmy); + p_me->bcost = bcost; + p_me->bcost2 = bcost; + p_me->mvcost[PDIR_FWD] = MV_COST_IPEL(bmx, bmy); + + /* ------------------------------------------------------------- + * sub-pel refine */ + if (h->use_fractional_me) { + bcost = me_subpel_refine(h, p_me); + } +_me_error8: + return bcost; + } else { + pel10_t *p_org = p_me->p_fenc10; + pel10_t *p_fref = p_me->p_fref_1st->planes10[IMG_Y] + p_me->i_bias; for (i = 0; i < i_mvc; i++) { int mx = mvc[i][0]; int my = mvc[i][1]; - ME_COST_IPEL(mx, my); + ME_COST_IPEL10(mx, my); } if (bcost == MAX_DISTORTION) { - goto _me_error; /* me failed */ + goto _me_error10; /* me failed */ } /* ------------------------------------------------------------- * search using different method */ + switch (h->param->me_method) 
{ case XAVS2_ME_TZ: { /* TZ */ const int RasterDistance = 16; @@ -933,11 +1682,11 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc omx = bmx; omy = bmy; - ME_COST_IPEL_X3(-2, 0, -1, 2, 1, 2); - ME_COST_IPEL_X3( 2, 0, 1, -2, -1, -2); + ME_COST_IPEL10_X3(-2, 0, -1, 2, 1, 2); + ME_COST_IPEL10_X3( 2, 0, 1, -2, -1, -2); if (CHECK_MV_RANGE(bmx, bmy)) { - DIA_ITER(bmx, bmy); + DIA_ITER10(bmx, bmy); } mvinfo.bcost = bcost; @@ -945,7 +1694,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc mvinfo.bmx = bmx; mvinfo.bmy = bmy; mvinfo.bdir = 0; - tz_pattern_search(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, EarlyExitIters, me_range); + tz_pattern_search10(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, EarlyExitIters, me_range); bcost = mvinfo.bcost; bdist = mvinfo.bdist; bmx = mvinfo.bmx; @@ -964,10 +1713,10 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc mv2_x = bmx + offsets[(dir - 1) * 2 + 1][0]; /* 4 * 5 */ mv2_y = bmy + offsets[(dir - 1) * 2 + 1][1]; /* X 6 7 8 X */ if (CHECK_MV_RANGE(mv1_x, mv1_y)) { /* X X */ - ME_COST_IPEL(mv1_x, mv1_y); + ME_COST_IPEL10(mv1_x, mv1_y); } if (CHECK_MV_RANGE(mv2_x, mv2_y)) { - ME_COST_IPEL(mv2_x, mv2_y); + ME_COST_IPEL10(mv2_x, mv2_y); } /* if no new point is found, stop */ @@ -986,7 +1735,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc int rmv_x_max = XAVS2_MIN(mv_x_max, bmx + RasterDistance - 2); for (j = rmv_y_min; j < rmv_y_max; j += iRasterDist) { for (i = rmv_x_min; i < rmv_x_max; i += iRasterDist) { - ME_COST_IPEL_X4(i, j, i, j + iRasterDist2, i + iRasterDist2, j, i + iRasterDist2, j + iRasterDist2); + ME_COST_IPEL10_X4(i, j, i, j + iRasterDist2, i + iRasterDist2, j, i + iRasterDist2, j + iRasterDist2); } } } @@ -998,7 +1747,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], 
int i_mvc mvinfo.bmx = bmx; mvinfo.bmy = bmy; mvinfo.bdir = 0; - tz_pattern_search(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, MaxIters, me_range); + tz_pattern_search10(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, MaxIters, me_range); bcost = mvinfo.bcost; bdist = mvinfo.bdist; bmx = mvinfo.bmx; @@ -1013,10 +1762,10 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc mv2_x = bmx + offsets[(dir - 1) * 2 + 1][0]; /* X 6 7 8 X */ mv2_y = bmy + offsets[(dir - 1) * 2 + 1][1]; /* X X */ if (CHECK_MV_RANGE(mv1_x, mv1_y)) { - ME_COST_IPEL(mv1_x, mv1_y); + ME_COST_IPEL10(mv1_x, mv1_y); } if (CHECK_MV_RANGE(mv2_x, mv2_y)) { - ME_COST_IPEL(mv2_x, mv2_y); + ME_COST_IPEL10(mv2_x, mv2_y); } } break; @@ -1024,14 +1773,14 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc } /* equivalent to the above, but eliminates duplicate candidates */ - goto umh_step_2; + goto umh_step10_2; } case XAVS2_ME_UMH: /* UMH */ /* http://www.cnblogs.com/TaigaCon/archive/2014/06/16/3788984.html - * 0. 初始点搜索 */ - DIA_ITER(mvc[0][0], mvc[0][1]); + * 0. 
鍒濆鐐规悳绱 */ + DIA_ITER10(mvc[0][0], mvc[0][1]); if (pmv && (bmx != mvc[0][0] || bmy != mvc[0][1])) { - DIA_ITER(bmx, bmy); + DIA_ITER10(bmx, bmy); pmv = MAKEDWORD(bmx, bmy); } @@ -1039,53 +1788,53 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc if (p_me->mvp1.v != 0) { int mx = IPEL(p_me->mvp1.x); int my = IPEL(p_me->mvp1.y); - ME_COST_IPEL(mx, my); + ME_COST_IPEL10(mx, my); } - EARLY_TERMINATION(p_me->pred_sad_uplayer); + EARLY_TERMINATION10(p_me->pred_sad_uplayer); // g_me_time[1]++; // prediction using mv of last ref_idx motion vector if (p_me->i_ref_idx > 0) { - ME_COST_IPEL(IPEL(p_me->mvp2.x), IPEL(p_me->mvp2.y)); + ME_COST_IPEL10(IPEL(p_me->mvp2.x), IPEL(p_me->mvp2.y)); } if (p_me->mvp3.v != 0) { - ME_COST_IPEL(IPEL(p_me->mvp3.x), IPEL(p_me->mvp3.y)); + ME_COST_IPEL10(IPEL(p_me->mvp3.x), IPEL(p_me->mvp3.y)); } - /* 当前最优MV不是 MVP,搜索其周围一个小窗口 */ + /* 褰撳墠鏈浼楳V涓嶆槸 MVP锛屾悳绱㈠叾鍛ㄥ洿涓涓皬绐楀彛 */ if (pmv != (uint32_t)MAKEDWORD(bmx, bmy)) { - DIA_ITER(bmx, bmy); + DIA_ITER10(bmx, bmy); } // early termination algorithm - EARLY_TERMINATION(p_me->pred_sad); + EARLY_TERMINATION10(p_me->pred_sad); // umh_step_1: - /* UMH 1. Unsymmetrical-cross search (非对称十字搜索) */ + /* UMH 1. Unsymmetrical-cross search 锛堥潪瀵圭О鍗佸瓧鎼滅储锛 */ // g_me_time[2]++; omx = bmx; omy = bmy; for (i = 1; i <= me_range; i += 2) { - ME_COST_IPEL(omx + i, omy); - ME_COST_IPEL(omx - i, omy); + ME_COST_IPEL10(omx + i, omy); + ME_COST_IPEL10(omx - i, omy); } for (j = 1; j <= me_range / 2; j += 2) { - ME_COST_IPEL(omx, omy + j); - ME_COST_IPEL(omx, omy - j); + ME_COST_IPEL10(omx, omy + j); + ME_COST_IPEL10(omx, omy - j); } // early termination algorithm - EARLY_TERMINATION(p_me->pred_sad); + EARLY_TERMINATION10(p_me->pred_sad); - /* UMH 2. Spiral search (螺旋搜索) */ + /* UMH 2. 
Spiral search 锛堣灪鏃嬫悳绱級 */ omx = bmx; omy = bmy; for (i = 0; i < 24; i++) { - ME_COST_IPEL(omx + GRID[i][0], omy + GRID[i][1]); + ME_COST_IPEL10(omx + GRID[i][0], omy + GRID[i][1]); } // early termination algorithm - EARLY_TERMINATION(p_me->pred_sad); + EARLY_TERMINATION10(p_me->pred_sad); // big hexagon if (h->UMH_big_hex_level) { @@ -1093,26 +1842,26 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc omx = bmx; omy = bmy; for (i = 0; i < umh_1_3_step; i++) { - ME_COST_IPEL(omx + search_patern[i][0] * j, omy + search_patern[i][1] * j); + ME_COST_IPEL10(omx + search_patern[i][0] * j, omy + search_patern[i][1] * j); } if (bmx != omx || bmy != omy) { - EARLY_TERMINATION(p_me->pred_sad); + EARLY_TERMINATION10(p_me->pred_sad); } } } /* !!! NO break statement here */ case XAVS2_ME_HEX: /* hexagon search */ -umh_step_2 : /* UMH 3. Uneven Multi-Hexagon-grid Search (不规律六边形模板搜索) */ +umh_step10_2 : /* UMH 3. Uneven Multi-Hexagon-grid Search 锛堜笉瑙勫緥鍏竟褰㈡ā鏉挎悳绱級 */ // g_me_time[3]++; dir = 0; /* 6 5 */ omx = bmx; /* */ omy = bmy; /* 1 * 4 */ - ME_COST_IPEL_X3_DIR(-1,-2,6, 1,-2,5, -2,0,1); /* */ - ME_COST_IPEL_X3_DIR( 2, 0,4, -1, 2,2, 1,2,3); /* 2 3 */ + ME_COST_IPEL10_X3_DIR(-1,-2,6, 1,-2,5, -2,0,1); /* */ + ME_COST_IPEL10_X3_DIR( 2, 0,4, -1, 2,2, 1,2,3); /* 2 3 */ if (dir) { const int8_t (*hex)[2]; - /* UMH 4. Extended Hexagon-based Search (六边形模板反复搜索) */ + /* UMH 4. 
Extended Hexagon-based Search 锛堝叚杈瑰舰妯℃澘鍙嶅鎼滅储锛 */ idx = dir - 1; /* start array index */ /* half hexagon, not overlapping the previous iteration */ for (i = 0; i < me_range && CHECK_MV_RANGE(bmx, bmy); i++) { @@ -1120,7 +1869,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc omx = bmx; omy = bmy; hex = &HEX2[idx]; - ME_COST_IPEL_X3_DIR(hex[0][0],hex[0][1],1, hex[1][0],hex[1][1],2, hex[2][0],hex[2][1],3); + ME_COST_IPEL10_X3_DIR(hex[0][0],hex[0][1],1, hex[1][0],hex[1][1],2, hex[2][0],hex[2][1],3); if (!dir) { break; /* early terminate */ } @@ -1129,12 +1878,12 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc } /* !!! NO break statement here */ case XAVS2_ME_DIA: /* diamond search */ -umh_step_3: /* UMH 5. the third step with a small search pattern (小菱形模板反复搜索) */ +umh_step10_3: /* UMH 5. the third step with a small search pattern 锛堝皬鑿卞舰妯℃澘鍙嶅鎼滅储锛 */ dir = 0; if (CHECK_MV_RANGE(bmx, bmy)) { omx = bmx; /* 4 */ omy = bmy; /* 1 * 3 */ - ME_COST_IPEL_X4_DIR(0,-1,4, -1,0,1, 1,0,3, 0,1,2); /* 2 */ + ME_COST_IPEL10_X4_DIR(0,-1,4, -1,0,1, 1,0,3, 0,1,2); /* 2 */ } if (dir) { const int8_t (*dia)[2]; @@ -1145,7 +1894,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc omx = bmx; omy = bmy; dia = &DIA1[idx]; - ME_COST_IPEL_X3_DIR(dia[0][0],dia[0][1],1, dia[1][0],dia[1][1],2, dia[2][0],dia[2][1],3); + ME_COST_IPEL10_X3_DIR(dia[0][0],dia[0][1],1, dia[1][0],dia[1][1],2, dia[2][0],dia[2][1],3); if (!dir) { break; /* early terminate */ } @@ -1158,7 +1907,7 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc omy = bmy; for (j = -me_range; j < me_range; j++) { for (i = -me_range; i < me_range; i++) { - ME_COST_IPEL(omx + i, omy + j); + ME_COST_IPEL10(omx + i, omy + j); } } break; @@ -1178,8 +1927,9 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc bcost = me_subpel_refine(h, p_me); } -_me_error: +_me_error10: return 
bcost; + } } @@ -1187,13 +1937,98 @@ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc * find motion vector for forward dual hypothesis prediction (sub-pel search) * return minimum motion cost after search */ -dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *mv) +dist_t xavs2_me_search_sym8(xavs2_t *h, xavs2_me_t *p_me, pel8_t *buf_pixel_temp, mv_t *mv) +{ + const int search_pos2 = 5; // search positions for half-pel search (default: 9) + const int search_pos4 = 5; // search positions for quarter-pel search (default: 9) + pel8_t **p_filtered1 = p_me->p_fref_1st->filtered8; + pel8_t **p_filtered2 = p_me->p_fref_2nd->filtered8; + pel8_t *p_org = p_me->p_fenc8; + int distance_fwd = p_me->i_distance_1st; + int distance_bwd = p_me->i_distance_2nd; + int i_pixel = p_me->i_pixel; + int i_offset = p_me->i_bias; + int ctr_x = (p_me->mvp1.x >> 1) << 1; + int ctr_y = (p_me->mvp1.y >> 1) << 1; + int mv_x_min = p_me->mv_min[0]; + int mv_y_min = p_me->mv_min[1]; + int mv_x_max = p_me->mv_max[0]; + int mv_y_max = p_me->mv_max[1]; + int lambda = h->i_lambda_factor; + int min_pos2 = (h->param->enable_hadamard ? 0 : 1); + int max_pos2 = (h->param->enable_hadamard ? 
XAVS2_MAX(1, search_pos2) : search_pos2); + const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min); + const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000; + const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp.x; + const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp.y; + mv_t bmv = *mv; // best mv + dist_t bcost = MAX_DISTORTION; + dist_t cost; + int pos; + int mx, my; + int i_fref = p_me->p_fref_1st->i_stride[IMG_Y]; + + if (!h->use_fractional_me) { + mx = mv->x; + my = mv->y; + + ME_COST_QPEL8_SYM; + bcost = cost; + bmv.v = MAKEDWORD(mx, my); + return bcost; + } + + // loop over search positions + for (pos = min_pos2; pos < max_pos2; pos++) { + mx = mv->x + (Spiral[pos][0] << 1); // quarter-pel units + my = mv->y + (Spiral[pos][1] << 1); // quarter-pel units + + ME_COST_QPEL8_SYM; + if (cost < bcost) { + bcost = cost; + bmv.v = MAKEDWORD(mx, my); + } + } + + mv->v = bmv.v; + + /* ------------------------------------------------------------- + * quarter-pel refine */ + + // loop over search positions + if (h->use_fractional_me >= 2) { + for (pos = 1; pos < search_pos4; pos++) { + if (h->param->enable_pmvr) { + if (pmvr_adapt_mv(&mx, &my, ctr_x, ctr_y, mv->x, mv->y, Spiral[pos][0], Spiral[pos][1])) { + continue; + } + } else { + mx = mv->x + Spiral[pos][0]; // quarter-pel units + my = mv->y + Spiral[pos][1]; // quarter-pel units + } + + ME_COST_QPEL8_SYM; + if (cost < bcost) { + bcost = cost; + bmv.v = MAKEDWORD(mx, my); + } + } + } + + mv->v = bmv.v; + p_me->mvcost[PDIR_SYM] = MV_COST_FPEL(bmv.x, bmv.y); + + // return minimum motion cost + return bcost; +} + +dist_t xavs2_me_search_sym10(xavs2_t *h, xavs2_me_t *p_me, pel10_t *buf_pixel_temp, mv_t *mv) { const int search_pos2 = 5; // search positions for half-pel search (default: 9) const int search_pos4 = 5; // search positions for quarter-pel search (default: 9) - pel_t **p_filtered1 = p_me->p_fref_1st->filtered; - pel_t **p_filtered2 = p_me->p_fref_2nd->filtered; - pel_t *p_org = 
p_me->p_fenc; + pel10_t **p_filtered1 = p_me->p_fref_1st->filtered10; + pel10_t **p_filtered2 = p_me->p_fref_2nd->filtered10; + pel10_t *p_org = p_me->p_fenc10; int distance_fwd = p_me->i_distance_1st; int distance_bwd = p_me->i_distance_2nd; int i_pixel = p_me->i_pixel; @@ -1222,7 +2057,7 @@ dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mx = mv->x; my = mv->y; - ME_COST_QPEL_SYM; + ME_COST_QPEL10_SYM; bcost = cost; bmv.v = MAKEDWORD(mx, my); return bcost; @@ -1233,7 +2068,7 @@ dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mx = mv->x + (Spiral[pos][0] << 1); // quarter-pel units my = mv->y + (Spiral[pos][1] << 1); // quarter-pel units - ME_COST_QPEL_SYM; + ME_COST_QPEL10_SYM; if (cost < bcost) { bcost = cost; bmv.v = MAKEDWORD(mx, my); @@ -1257,7 +2092,7 @@ dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, my = mv->y + Spiral[pos][1]; // quarter-pel units } - ME_COST_QPEL_SYM; + ME_COST_QPEL10_SYM; if (cost < bcost) { bcost = cost; bmv.v = MAKEDWORD(mx, my); @@ -1275,11 +2110,127 @@ dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, /* --------------------------------------------------------------------------- * return minimum motion cost after search (sub-pel search) */ -dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc) +dist_t xavs2_me_search_bid8(xavs2_t *h, xavs2_me_t *p_me, pel8_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc) +{ + pel8_t **p_filtered1 = p_me->p_fref_1st->filtered8; + pel8_t **p_filtered2 = p_me->p_fref_2nd->filtered8; + pel8_t *p_org = p_me->p_fenc8; + const int search_pos2 = 9; // search positions for half-pel search (default: 9) + const int search_pos4 = 9; // search positions for quarter-pel search (default: 9) + int i_pixel = p_me->i_pixel; + int i_offset = p_me->i_bias; + int ctr_x = (p_me->mvp1.x >> 1) << 1; + int 
ctr_y = (p_me->mvp1.y >> 1) << 1; + int mv_x_min = p_me->mv_min[0]; + int mv_y_min = p_me->mv_min[1]; + int mv_x_max = p_me->mv_max[0]; + int mv_y_max = p_me->mv_max[1]; + int lambda = h->i_lambda_factor; + int min_pos2 = (h->param->enable_hadamard ? 0 : 1); + int max_pos2 = (h->param->enable_hadamard ? XAVS2_MAX(1, search_pos2) : search_pos2); + int block_w = p_me->i_block_w; + int xx2; + int yy2; + int mv_bid_bit; + const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min); + const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000; + const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp1.x; + const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp1.y; + const uint16_t *p_cost_bix = h->mvbits - p_me->mvp2.x; + const uint16_t *p_cost_biy = h->mvbits - p_me->mvp2.y; + mv_t bmv = *fwd_mv; // best mv + dist_t bcost = MAX_DISTORTION; + dist_t cost; + int mx, my, mx_bid, my_bid; + int pos; + int i_fref = p_me->p_fref_1st->i_stride[IMG_Y]; + coeff_t *cur_blk = p_enc->coeff_blk; + + mx_bid = bwd_mv->x; + my_bid = bwd_mv->y; + + //鍦ㄨ繖閲屾妸缂栫爜鍊间笌棰勬祴鍊肩殑璁$畻鍏紡鎹㈢畻涓2鍊嶇紪鐮佸-鍚庡悜棰勬祴鍊 + xx2 = mx_bid >> 2; + yy2 = my_bid >> 2; + mv_bid_bit = MV_COST_FPEL_BID(mx_bid, my_bid); + + if (CHECK_MV_RANGE(mx_bid, my_bid)) { + pel8_t *p_src2 = p_filtered2[((my_bid & 3) << 2) + (mx_bid & 3)]; + + if (p_src2 != NULL) { + p_src2 += i_offset + yy2 * i_fref + xx2; + g_funcs.pixf.sub_ps8[i_pixel](cur_blk, block_w, p_org, p_src2, FENC_STRIDE, i_fref);//M-A + } else { + ALIGN32(pel8_t tmp_pred8[MAX_CU_SIZE * MAX_CU_SIZE]); + mv_t mvt; + mvt.x = (int16_t)mx_bid; + mvt.y = (int16_t)my_bid; + get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, block_w, p_me->i_block_h); + mc_luma8(h, tmp_pred8, MAX_CU_SIZE, mvt.x, mvt.y, block_w, p_me->i_block_h, p_me->p_fref_2nd); + g_funcs.pixf.sub_ps8[i_pixel](cur_blk, block_w, p_org, tmp_pred8, FENC_STRIDE, MAX_CU_SIZE);//M-A + } + g_funcs.pixf.add_ps8[i_pixel](h, buf_pixel_temp, MAX_CU_SIZE, p_org, cur_blk, FENC_STRIDE, block_w);//M-A+M + } + + if 
(!h->use_fractional_me) { + mx = fwd_mv->x; + my = fwd_mv->y; + + ME_COST_QPEL8_BID; + bcost = cost; + bmv.v = MAKEDWORD(mx, my); + return bcost; + } + + // loop over search positions + for (pos = min_pos2; pos < max_pos2; pos++) { + mx = fwd_mv->x + (Spiral[pos][0] << 1); // quarter-pel units + my = fwd_mv->y + (Spiral[pos][1] << 1); // quarter-pel units + + ME_COST_QPEL8_BID; + if (cost < bcost) { + bcost = cost; + bmv.v = MAKEDWORD(mx, my); + } + } + + fwd_mv->v = bmv.v; + + /* ------------------------------------------------------------- + * quarter-pel refine */ + + // loop over search positions + if (h->use_fractional_me >= 2) { + for (pos = 1; pos < search_pos4; pos++) { + if (h->param->enable_pmvr) { + if (pmvr_adapt_mv(&mx, &my, ctr_x, ctr_y, fwd_mv->x, fwd_mv->y, Spiral[pos][0], Spiral[pos][1])) { + continue; + } + } else { + mx = fwd_mv->x + Spiral[pos][0]; // quarter-pel units + my = fwd_mv->y + Spiral[pos][1]; // quarter-pel units + } + + ME_COST_QPEL8_BID; + if (cost < bcost) { + bcost = cost; + bmv.v = MAKEDWORD(mx, my); + } + } + } + + fwd_mv->v = bmv.v; + p_me->mvcost[PDIR_BID] = MV_COST_FPEL(bmv.x, bmv.y) + MV_COST_FPEL_BID(mx_bid, my_bid); + + // return minimum motion cost + return bcost; +} + +dist_t xavs2_me_search_bid10(xavs2_t *h, xavs2_me_t *p_me, pel10_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc) { - pel_t **p_filtered1 = p_me->p_fref_1st->filtered; - pel_t **p_filtered2 = p_me->p_fref_2nd->filtered; - pel_t *p_org = p_me->p_fenc; + pel10_t **p_filtered1 = p_me->p_fref_1st->filtered10; + pel10_t **p_filtered2 = p_me->p_fref_2nd->filtered10; + pel10_t *p_org = p_me->p_fenc10; const int search_pos2 = 9; // search positions for half-pel search (default: 9) const int search_pos4 = 9; // search positions for quarter-pel search (default: 9) int i_pixel = p_me->i_pixel; @@ -1314,34 +2265,34 @@ dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mx_bid = bwd_mv->x; my_bid = bwd_mv->y; - 
//在这里把编码值与预测值的计算公式换算为2倍编码值-后向预测值 + //鍦ㄨ繖閲屾妸缂栫爜鍊间笌棰勬祴鍊肩殑璁$畻鍏紡鎹㈢畻涓2鍊嶇紪鐮佸-鍚庡悜棰勬祴鍊 xx2 = mx_bid >> 2; yy2 = my_bid >> 2; mv_bid_bit = MV_COST_FPEL_BID(mx_bid, my_bid); if (CHECK_MV_RANGE(mx_bid, my_bid)) { - pel_t *p_src2 = p_filtered2[((my_bid & 3) << 2) + (mx_bid & 3)]; + pel10_t *p_src2 = p_filtered2[((my_bid & 3) << 2) + (mx_bid & 3)]; if (p_src2 != NULL) { p_src2 += i_offset + yy2 * i_fref + xx2; - g_funcs.pixf.sub_ps[i_pixel](cur_blk, block_w, p_org, p_src2, FENC_STRIDE, i_fref);//M-A + g_funcs.pixf.sub_ps10[i_pixel](cur_blk, block_w, p_org, p_src2, FENC_STRIDE, i_fref);//M-A } else { - ALIGN32(pel_t tmp_pred[MAX_CU_SIZE * MAX_CU_SIZE]); + ALIGN32(pel10_t tmp_pred10[MAX_CU_SIZE * MAX_CU_SIZE]); mv_t mvt; mvt.x = (int16_t)mx_bid; mvt.y = (int16_t)my_bid; get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, block_w, p_me->i_block_h); - mc_luma(tmp_pred, MAX_CU_SIZE, mvt.x, mvt.y, block_w, p_me->i_block_h, p_me->p_fref_2nd); - g_funcs.pixf.sub_ps[i_pixel](cur_blk, block_w, p_org, tmp_pred, FENC_STRIDE, MAX_CU_SIZE);//M-A + mc_luma10(h, tmp_pred10, MAX_CU_SIZE, mvt.x, mvt.y, block_w, p_me->i_block_h, p_me->p_fref_2nd); + g_funcs.pixf.sub_ps10[i_pixel](cur_blk, block_w, p_org, tmp_pred10, FENC_STRIDE, MAX_CU_SIZE);//M-A } - g_funcs.pixf.add_ps[i_pixel](buf_pixel_temp, MAX_CU_SIZE, p_org, cur_blk, FENC_STRIDE, block_w);//M-A+M + g_funcs.pixf.add_ps10[i_pixel](h, buf_pixel_temp, MAX_CU_SIZE, p_org, cur_blk, FENC_STRIDE, block_w);//M-A+M } if (!h->use_fractional_me) { mx = fwd_mv->x; my = fwd_mv->y; - ME_COST_QPEL_BID; + ME_COST_QPEL10_BID; bcost = cost; bmv.v = MAKEDWORD(mx, my); return bcost; @@ -1352,7 +2303,7 @@ dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mx = fwd_mv->x + (Spiral[pos][0] << 1); // quarter-pel units my = fwd_mv->y + (Spiral[pos][1] << 1); // quarter-pel units - ME_COST_QPEL_BID; + ME_COST_QPEL10_BID; if (cost < bcost) { bcost = cost; bmv.v = MAKEDWORD(mx, my); @@ -1376,7 +2327,7 @@ dist_t 
xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, my = fwd_mv->y + Spiral[pos][1]; // quarter-pel units } - ME_COST_QPEL_BID; + ME_COST_QPEL10_BID; if (cost < bcost) { bcost = cost; bmv.v = MAKEDWORD(mx, my); diff --git a/source/encoder/me.h b/source/encoder/me.h index 2d88fb2..1eb0e93 100644 --- a/source/encoder/me.h +++ b/source/encoder/me.h @@ -120,9 +120,13 @@ void xavs2_me_init_umh_threshold(xavs2_t *h, double *bsize, int i_qp); #define xavs2_me_search FPFX(me_search) dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc); -#define xavs2_me_search_sym FPFX(me_search_sym) -dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *mv); -#define xavs2_me_search_bid FPFX(me_search_bid) -dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc); +#define xavs2_me_search_sym8 FPFX(me_search_sym8) +dist_t xavs2_me_search_sym8(xavs2_t *h, xavs2_me_t *p_me, pel8_t *buf_pixel_temp, mv_t *mv); +#define xavs2_me_search_sym10 FPFX(me_search_sym10) +dist_t xavs2_me_search_sym10(xavs2_t *h, xavs2_me_t *p_me, pel10_t *buf_pixel_temp, mv_t *mv); +#define xavs2_me_search_bid8 FPFX(me_search_bid8) +dist_t xavs2_me_search_bid8(xavs2_t *h, xavs2_me_t *p_me, pel8_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc); +#define xavs2_me_search_bid10 FPFX(me_search_bid10) +dist_t xavs2_me_search_bid10(xavs2_t *h, xavs2_me_t *p_me, pel10_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc); #endif // XAVS2_ME_H diff --git a/source/encoder/parameters.c b/source/encoder/parameters.c index 9adec5b..13dea95 100644 --- a/source/encoder/parameters.c +++ b/source/encoder/parameters.c @@ -207,7 +207,7 @@ mapping_default(xavs2_param_map_t *p_map_tab, xavs2_param_t *p) MAP("ALFLowLatencyEncodingEnable", &p->alf_LowLatencyEncoding, MAP_NUM, "Enable Low Latency ALF (1=Low Latency mode, 0=High Efficiency mode)"); 
MAP("CrossSliceLoopFilter", &p->b_cross_slice_loop_filter, MAP_NUM, "Enable Cross Slice Boundary Filter (0=Disable, 1=Enable)"); - /* 场编码参数 */ + /* 鍦虹紪鐮佸弬鏁 */ // MAP("InterlaceCodingOption", &p->InterlaceCodingOption, MAP_NUM); // MAP("RepeatFirstField", &p->repeat_first_field, MAP_NUM); // MAP("TopFieldFirst", &p->top_field_first, MAP_NUM); @@ -425,7 +425,7 @@ int ParameterNameToMapIndex(xavs2_param_map_t *p_map_tab, const char *param_name mapping_t *map_tab = p_map_tab->map_tab; int i = 0; - while (map_tab[i].name[0] != '\0') { // 终止位置是空字符串 + while (map_tab[i].name[0] != '\0') { // 缁堟浣嶇疆鏄┖瀛楃涓 if (xavs2_param_match(map_tab[i].name, param_name)) { return i; } else { @@ -439,7 +439,7 @@ int ParameterNameToMapIndex(xavs2_param_map_t *p_map_tab, const char *param_name /* --------------------------------------------------------------------------- */ static INLINE -void get_param_name(char *name, const char *param_item) +void get_param_name(char *name, char *param_item) { char *str; name[0] = '\0'; @@ -629,7 +629,7 @@ xavs2_encoder_opt_set(xavs2_param_t *param, int argc, char *argv[]) int in_item = 0; int i; - if ((contents = xavs2_get_configs(argc, argv)) == NULL) { + if ((contents = xavs2_get_configs(argc, (const char * const *)argv)) == NULL) { fprintf(stderr, "get contents from configure file error."); return -1; } diff --git a/source/encoder/presets.c b/source/encoder/presets.c index dd6f5af..2143112 100644 --- a/source/encoder/presets.c +++ b/source/encoder/presets.c @@ -53,7 +53,7 @@ * =========================================================================== */ /* --------------------------------------------------------------------------- - * 帧内亮度块的RDO模式数量,对应不同preset档次 + * 甯у唴浜害鍧楃殑RDO妯″紡鏁伴噺锛屽搴斾笉鍚宲reset妗f */ static const uint8_t INTRA_FULL_RDO_NUM[][MAX_CU_SIZE_IN_BIT + 1] = { { 0, 0, 1, 1, 1, 1, 1 }, /* 0: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ @@ -69,13 +69,13 @@ static const uint8_t INTRA_FULL_RDO_NUM[][MAX_CU_SIZE_IN_BIT + 1] = { }; /* 
--------------------------------------------------------------------------- - * 帧内色度块 RDO 的最大模式数量 (不同preset档次) + * 甯у唴鑹插害鍧 RDO 鐨勬渶澶фā寮忔暟閲 (涓嶅悓preset妗f) */ static const int8_t tab_num_rdo_chroma_intra_mode[] = { 1, 2, 2, 2, 3, 3, 4, 4, 5, 5 }; -/* 帧内RMD搜索的阈值,步长为2和1搜索的角度数量 */ +/* 甯у唴RMD鎼滅储鐨勯槇鍊硷紝姝ラ暱涓2鍜1鎼滅储鐨勮搴︽暟閲 */ static const int8_t tab_num_angle_dist2[] = { 0, 0, 4, 4, 4, 4, 5, 5, 6, 6 }; @@ -84,14 +84,14 @@ static const int8_t tab_num_angle_dist1[] = { }; /* --------------------------------------------------------------------------- - * 全零块检测时的判定阈值倍率 + * 鍏ㄩ浂鍧楁娴嬫椂鐨勫垽瀹氶槇鍊煎嶇巼 */ static const float tab_th_zero_block_factor[] = { 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 }; /* --------------------------------------------------------------------------- - * QSFD算法的阈值计算系数(不同preset) + * QSFD绠楁硶鐨勯槇鍊艰绠楃郴鏁帮紙涓嶅悓preset锛 */ const static double tab_qsfd_s_presets[][10] = { /* preset_level: @@ -103,21 +103,23 @@ const static double tab_qsfd_cu_size_weight[4] = { 0.25, 1.0, 3.0, 7.5 /* 8x8, 16x16, 32x32, 64x64 */ }; -double tab_qsfd_thres[MAX_QP][2][CTU_DEPTH]; +//extern double tab_qsfd_thres[MAX_QP][2][CTU_DEPTH]; /*-------------------------------------------------------------------------- */ static INLINE void algorithm_init_thresholds(xavs2_param_t *p_param) { + double tab_qsfd_thres[MAX_QP + (p_param->sample_bit_depth - 8) * 8][2][CTU_DEPTH]; int i_preset_level = p_param->preset_level; //trade-off encoding time and performance const double s_inter = tab_qsfd_s_presets[0][i_preset_level]; const double s_intra = tab_qsfd_s_presets[1][i_preset_level]; int i; + int max_qp = MAX_QP + (p_param->sample_bit_depth - 8) * 8; /* QSFD threasholds */ - for (i = 0; i < MAX_QP; i++) { + for (i = 0; i < max_qp; i++) { double qstep = 32768.0 / tab_Q_TAB[i]; double th_base = 350 * pow(qstep, 0.9); double th__8 = th_base * tab_qsfd_cu_size_weight[0]; @@ -140,7 +142,7 @@ void algorithm_init_thresholds(xavs2_param_t *p_param) tab_qsfd_thres[i][1][3] = th_64 * s_intra * 1.0; } - /* 全零块检测 */ + /* 鍏ㄩ浂鍧楁娴 */ 
p_param->factor_zero_block = tab_th_zero_block_factor[i_preset_level]; } @@ -164,7 +166,7 @@ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level) p_param->num_max_ref = XAVS2_MIN(i_preset_level, 4); } - /* --------------------------- CU结构 --------------------------- + /* --------------------------- CU缁撴瀯 --------------------------- | preset | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | +=================+=====+=====+=====+=====+=====+=====+======+======+======+======+ | ctu | 32 | 32 | 64 | 64 | 64 | 64 | 64 | 64 | 64 | 64 | @@ -172,7 +174,7 @@ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level) */ p_param->lcu_bit_level = XAVS2_MIN(p_param->lcu_bit_level, 5 + (i_preset_level > 1)); - /* --------------------------- 预测 --------------------------- + /* --------------------------- 棰勬祴 --------------------------- */ p_param->inter_2pu = i_preset_level > 1; p_param->enable_amp = i_preset_level > 5; // NSQT @@ -183,17 +185,17 @@ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level) p_param->enable_dhp = i_preset_level > 7 && p_param->enable_f_frame; p_param->enable_dmh = i_preset_level > 6 && p_param->enable_f_frame; - /* --------------------------- 变换 --------------------------- */ + /* --------------------------- 鍙樻崲 --------------------------- */ p_param->enable_sdip = i_preset_level > 5; p_param->enable_nsqt = i_preset_level > 5; p_param->enable_secT = i_preset_level > -1; p_param->b_fast_2lelvel_tu = i_preset_level < 4; - /* --------------------------- 量化 --------------------------- + /* --------------------------- 閲忓寲 --------------------------- * Level: All for preset 9, Off for preset 0~2 */ p_param->i_rdoq_level = i_preset_level > 8 ? RDOQ_ALL : i_preset_level > 5 ? 
RDOQ_CU_LEVEL : RDOQ_OFF; - /* --------------------------- RDO档次 --------------------------- + /* --------------------------- RDO妗f --------------------------- */ if (i_preset_level < 0) { p_param->i_rd_level = RDO_OFF; @@ -205,7 +207,7 @@ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level) p_param->i_rd_level = RDO_ALL; } - /* --------------------------- 熵编码 --------------------------- + /* --------------------------- 鐔电紪鐮 --------------------------- */ if (i_preset_level <= 3) { p_param->rdo_bit_est_method = 2; @@ -215,13 +217,13 @@ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level) p_param->rdo_bit_est_method = 0; } - /* --------------------------- 滤波 --------------------------- + /* --------------------------- 婊ゆ尝 --------------------------- */ p_param->enable_alf = p_param->enable_alf && i_preset_level > 4; p_param->enable_sao = p_param->enable_sao && i_preset_level > 1; - p_param->b_fast_sao = i_preset_level < 5; // 档次4以下开启快速SAO编码决策 + p_param->b_fast_sao = i_preset_level < 5; // 妗f4浠ヤ笅寮鍚揩閫烻AO缂栫爜鍐崇瓥 - /* --------------------------- 其他 --------------------------- + /* --------------------------- 鍏朵粬 --------------------------- */ p_param->enable_hadamard = i_preset_level > 0; p_param->enable_tdrdo = i_preset_level > 4 && p_param->enable_tdrdo; @@ -329,8 +331,8 @@ void encoder_set_fast_algorithms(xavs2_t *h) * 1, switch on some algorithms with little efficiency loss */ - /* 是否需要分像素运动搜索 - * 参考帧数量大于1个时,会出现MV的缩放而导致MV像素精度达到1/4 + /* 鏄惁闇瑕佸垎鍍忕礌杩愬姩鎼滅储 + * 鍙傝冨抚鏁伴噺澶т簬1涓椂锛屼細鍑虹幇MV鐨勭缉鏀捐屽鑷碝V鍍忕礌绮惧害杈惧埌1/4 */ if (i_preset_level < 2) { h->use_fractional_me = 1; @@ -355,16 +357,24 @@ void encoder_set_fast_algorithms(xavs2_t *h) } else { memcpy(h->tab_num_intra_rdo, INTRA_FULL_RDO_NUM[i_preset_level >> 0], sizeof(h->tab_num_intra_rdo)); } - /* RMD算法的搜索角度数量 */ + /* RMD绠楁硶鐨勬悳绱㈣搴︽暟閲 */ h->num_intra_rmd_dist2 = tab_num_angle_dist2[i_preset_level]; h->num_intra_rmd_dist1 = tab_num_angle_dist1[i_preset_level]; h->num_rdo_intra_chroma = 
tab_num_rdo_chroma_intra_mode[i_preset_level]; - /* 帧内预测模式 */ + /* 甯у唴棰勬祴妯″紡 */ + if (h->param->input_sample_bit_depth == 8) { if (IS_ALG_ENABLE(OPT_FAST_INTRA_MODE)) { - h->get_intra_candidates_luma = rdo_get_pred_intra_luma_rmd; + h->get_intra_candidates_luma8 = rdo_get_pred_intra_luma8_rmd; } else { - h->get_intra_candidates_luma = rdo_get_pred_intra_luma; + h->get_intra_candidates_luma8 = rdo_get_pred_intra_luma8; + } + } else { + if (IS_ALG_ENABLE(OPT_FAST_INTRA_MODE)) { + h->get_intra_candidates_luma10 = rdo_get_pred_intra_luma10_rmd; + } else { + h->get_intra_candidates_luma10 = rdo_get_pred_intra_luma10; + } } if (IS_ALG_ENABLE(OPT_FAST_RDO_INTRA_C)) { h->get_intra_candidates_chroma = rdo_get_pred_intra_chroma_fast; diff --git a/source/encoder/ratecontrol.c b/source/encoder/ratecontrol.c index f071a68..33bb596 100644 --- a/source/encoder/ratecontrol.c +++ b/source/encoder/ratecontrol.c @@ -187,10 +187,9 @@ static const double tab_qp_gpp[3][3] = { /* --------------------------------------------------------------------------- * compute the gradient per pixel */ -static double cal_frame_gradient(xavs2_frame_t *frm) +static double cal_frame_gradient(xavs2_t *h, xavs2_frame_t *frm) { double grad_per_pixel = 0; // gradient per pixel - pel_t *src = frm->planes[IMG_Y];// pointer to luma component int width = frm->i_width[IMG_Y]; int height = frm->i_lines[IMG_Y]; int stride = frm->i_stride[IMG_Y]; @@ -199,6 +198,24 @@ static double cal_frame_gradient(xavs2_frame_t *frm) width--; height--; + + if (h->param->input_sample_bit_depth == 8) { + pel8_t *src = frm->planes8[IMG_Y];// pointer to luma component + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + int dx = src[j] - src[j + 1]; + int dy = src[j] - src[j + stride]; + + if (dx || dy) { + grad_per_pixel += sqrt((double)(dx * dx + dy * dy)); + } + } + src += stride; + } + + return grad_per_pixel / size; + } else { + pel10_t *src = frm->planes10[IMG_Y];// pointer to luma component for (i = 0; i < 
height; i++) { for (j = 0; j < width; j++) { int dx = src[j] - src[j + 1]; @@ -212,6 +229,7 @@ static double cal_frame_gradient(xavs2_frame_t *frm) } return grad_per_pixel / size; + } } #endif @@ -341,7 +359,7 @@ static int rc_calculate_frame_qp(xavs2_t *h, int frm_idx, int frm_type, int forc /* compute the initial qp */ if (frm_idx == 0) { double bit = log(1000 * rc->f_target_bpp); - double gpp = log(cal_frame_gradient(h->fenc)); + double gpp = log(cal_frame_gradient(h, h->fenc)); int idx = XAVS2_MIN(2, rc->i_intra_period); int max_i_qp = 63 + (h->param->sample_bit_depth - 8) * 8 - 10; @@ -617,7 +635,7 @@ int xavs2_rc_get_frame_qp(xavs2_t *h, int frm_idx, int frm_type, int force_qp) */ int xavs2_rc_get_lcu_qp(xavs2_t *h, int frm_idx, int qp) { - UNUSED_PARAMETER(h); + //UNUSED_PARAMETER(h); UNUSED_PARAMETER(frm_idx); //if (h->param->i_rc_method == XAVS2_RC_CBR_SCU && img->current_mb_nr == 0) { @@ -691,7 +709,7 @@ int xavs2_rc_get_lcu_qp(xavs2_t *h, int frm_idx, int qp) */ void xavs2_rc_update_after_lcu_coded(xavs2_t *h, int frm_idx, int qp) { - UNUSED_PARAMETER(h); + //UNUSED_PARAMETER(h); UNUSED_PARAMETER(frm_idx); UNUSED_PARAMETER(qp); diff --git a/source/encoder/rdo.c b/source/encoder/rdo.c index 6bfbff1..d679600 100644 --- a/source/encoder/rdo.c +++ b/source/encoder/rdo.c @@ -38,7 +38,7 @@ #include "rdo.h" #include "cudata.h" #include "aec.h" -#include "common/mc.h" +#include "mc.h" #include "transform.h" #include "block_info.h" #include "wquant.h" @@ -58,8 +58,8 @@ /* --------------------------------------------------------------------------- */ static const float SUBCU_COST_RATE[2][4] = { - {0.50f, 0.75f, 0.97f, 1.0f}, /* 帧内CU的Cost一般都较大 */ - {0.75f, 0.90f, 0.99f, 1.0f}, /* 帧间情况下,Skip块Cost很小 */ + {0.50f, 0.75f, 0.97f, 1.0f}, /* 甯у唴CU鐨凜ost涓鑸兘杈冨ぇ */ + {0.75f, 0.90f, 0.99f, 1.0f}, /* 甯ч棿鎯呭喌涓嬶紝Skip鍧桟ost寰堝皬 */ }; static const int tab_pdir_bskip[DS_MAX_NUM] = { @@ -124,7 +124,7 @@ static const int8_t headerbits_skipmode[8] = { 2, 3, 4, 4, 3, 4, 5, 5 };//tempor */ 
/* --------------------------------------------------------------------------- - * 依据CU划分模式确定当前CU包含的PU数量和大小(帧间划分) + * 渚濇嵁CU鍒掑垎妯″紡纭畾褰撳墠CU鍖呭惈鐨凱U鏁伴噺鍜屽ぇ灏忥紙甯ч棿鍒掑垎锛 */ static ALWAYS_INLINE void cu_init_pu_inter(xavs2_t *h, cu_info_t *p_cu_info, int i_level, int i_mode) @@ -137,7 +137,7 @@ void cu_init_pu_inter(xavs2_t *h, cu_info_t *p_cu_info, int i_level, int i_mode) // set for each block if (i_mode == PRED_SKIP) { - ///! 一些特殊的Skip/Direct模式下如果CU超过8x8,则PU划分成4个 + ///! 涓浜涚壒娈婄殑Skip/Direct妯″紡涓嬪鏋淐U瓒呰繃8x8锛屽垯PU鍒掑垎鎴4涓 if (i_level > 3 && (h->i_type == SLICE_TYPE_P || (h->i_type == SLICE_TYPE_F && ds_mode == DS_NONE) || (h->i_type == SLICE_TYPE_B && ds_mode == DS_NONE))) { p_cu_info->num_pu = 4; @@ -158,7 +158,7 @@ void cu_init_pu_inter(xavs2_t *h, cu_info_t *p_cu_info, int i_level, int i_mode) } /* --------------------------------------------------------------------------- - * 依据CU划分模式确定当前CU包含的PU数量和大小(帧内划分) + * 渚濇嵁CU鍒掑垎妯″紡纭畾褰撳墠CU鍖呭惈鐨凱U鏁伴噺鍜屽ぇ灏忥紙甯у唴鍒掑垎锛 */ static ALWAYS_INLINE void cu_init_pu_intra(xavs2_t *h, cu_info_t *p_cu_info, int i_level, int i_mode) @@ -208,31 +208,59 @@ void cu_init(xavs2_t *h, cu_t *p_cu, cu_info_t *best, int i_level) cu_layer_t *p_layer = cu_get_layer(h, i_level); int i; + if (h->param->input_sample_bit_depth == 8) { /* Ping-pong buffer */ - p_layer->buf_pred_inter = p_layer->buf_pred_inter_luma[0]; - p_layer->buf_pred_inter_best = p_layer->buf_pred_inter_luma[1]; + p_layer->buf_pred_inter8 = p_layer->buf_pred_inter_luma8[0]; + p_layer->buf_pred_inter8_best = p_layer->buf_pred_inter_luma8[1]; /* init rec and coeff pointer */ - p_cu->cu_info.p_rec [0] = p_layer->rec_buf_y [0]; + p_cu->cu_info.p_rec8 [0] = p_layer->rec8_buf_y [0]; p_cu->cu_info.p_coeff[0] = p_layer->coef_buf_y[0]; - p_layer->p_rec_tmp [0] = p_layer->rec_buf_y [1]; + p_layer->p_rec8_tmp [0] = p_layer->rec8_buf_y [1]; p_layer->p_coeff_tmp [0] = p_layer->coef_buf_y[1]; - best->p_rec [0] = p_layer->rec_buf_y [2]; + best->p_rec8 [0] = p_layer->rec8_buf_y [2]; best->p_coeff [0] = 
p_layer->coef_buf_y[2]; - p_cu->cu_info.p_rec [1] = p_layer->rec_buf_uv [0][0]; + p_cu->cu_info.p_rec8 [1] = p_layer->rec8_buf_uv [0][0]; p_cu->cu_info.p_coeff[1] = p_layer->coef_buf_uv[0][0]; - p_layer->p_rec_tmp [1] = p_layer->rec_buf_uv [0][1]; + p_layer->p_rec8_tmp [1] = p_layer->rec8_buf_uv [0][1]; p_layer->p_coeff_tmp [1] = p_layer->coef_buf_uv[0][1]; - best->p_rec [1] = p_layer->rec_buf_uv [0][2]; + best->p_rec8 [1] = p_layer->rec8_buf_uv [0][2]; best->p_coeff [1] = p_layer->coef_buf_uv[0][2]; - p_cu->cu_info.p_rec [2] = p_layer->rec_buf_uv [1][0]; + p_cu->cu_info.p_rec8 [2] = p_layer->rec8_buf_uv [1][0]; p_cu->cu_info.p_coeff[2] = p_layer->coef_buf_uv[1][0]; - p_layer->p_rec_tmp [2] = p_layer->rec_buf_uv [1][1]; + p_layer->p_rec8_tmp [2] = p_layer->rec8_buf_uv [1][1]; p_layer->p_coeff_tmp [2] = p_layer->coef_buf_uv[1][1]; - best->p_rec [2] = p_layer->rec_buf_uv [1][2]; + best->p_rec8 [2] = p_layer->rec8_buf_uv [1][2]; best->p_coeff [2] = p_layer->coef_buf_uv[1][2]; + } else { + /* Ping-pong buffer */ + p_layer->buf_pred_inter10 = p_layer->buf_pred_inter_luma10[0]; + p_layer->buf_pred_inter10_best = p_layer->buf_pred_inter_luma10[1]; + + /* init rec and coeff pointer */ + p_cu->cu_info.p_rec10 [0] = p_layer->rec10_buf_y [0]; + p_cu->cu_info.p_coeff[0] = p_layer->coef_buf_y[0]; + p_layer->p_rec10_tmp [0] = p_layer->rec10_buf_y [1]; + p_layer->p_coeff_tmp [0] = p_layer->coef_buf_y[1]; + best->p_rec10 [0] = p_layer->rec10_buf_y [2]; + best->p_coeff [0] = p_layer->coef_buf_y[2]; + + p_cu->cu_info.p_rec10 [1] = p_layer->rec10_buf_uv [0][0]; + p_cu->cu_info.p_coeff[1] = p_layer->coef_buf_uv[0][0]; + p_layer->p_rec10_tmp [1] = p_layer->rec10_buf_uv [0][1]; + p_layer->p_coeff_tmp [1] = p_layer->coef_buf_uv[0][1]; + best->p_rec10 [1] = p_layer->rec10_buf_uv [0][2]; + best->p_coeff [1] = p_layer->coef_buf_uv[0][2]; + + p_cu->cu_info.p_rec10 [2] = p_layer->rec10_buf_uv [1][0]; + p_cu->cu_info.p_coeff[2] = p_layer->coef_buf_uv[1][0]; + p_layer->p_rec10_tmp [2] = 
p_layer->rec10_buf_uv [1][1]; + p_layer->p_coeff_tmp [2] = p_layer->coef_buf_uv[1][1]; + best->p_rec10 [2] = p_layer->rec10_buf_uv [1][2]; + best->p_coeff [2] = p_layer->coef_buf_uv[1][2]; + } /* init basic properties */ p_cu->cu_info.i_cbp = 0; @@ -255,7 +283,7 @@ void cu_init(xavs2_t *h, cu_t *p_cu, cu_info_t *best, int i_level) } #endif - /* ref_idx_1st[], ref_idx_2nd[] 内存连续 */ + /* ref_idx_1st[], ref_idx_2nd[] 鍐呭瓨杩炵画 */ memset(p_cu->cu_info.ref_idx_1st, INVALID_REF, sizeof(p_cu->cu_info.ref_idx_1st) + sizeof(p_cu->cu_info.ref_idx_2nd)); /* init position for 4 sub-CUs */ @@ -299,9 +327,15 @@ void cu_store_parameters(xavs2_t *h, cu_t *p_cu, cu_info_t *best) cu_copy_info(best, &p_cu->cu_info); /* --- reconstructed blocks ---- */ - XAVS2_SWAP_PTR(best->p_rec[0], p_cu->cu_info.p_rec[0]); - XAVS2_SWAP_PTR(best->p_rec[1], p_cu->cu_info.p_rec[1]); - XAVS2_SWAP_PTR(best->p_rec[2], p_cu->cu_info.p_rec[2]); + if (h->param->input_sample_bit_depth == 8) { + XAVS2_SWAP_PTR(best->p_rec8[0], p_cu->cu_info.p_rec8[0]); + XAVS2_SWAP_PTR(best->p_rec8[1], p_cu->cu_info.p_rec8[1]); + XAVS2_SWAP_PTR(best->p_rec8[2], p_cu->cu_info.p_rec8[2]); + } else { + XAVS2_SWAP_PTR(best->p_rec10[0], p_cu->cu_info.p_rec10[0]); + XAVS2_SWAP_PTR(best->p_rec10[1], p_cu->cu_info.p_rec10[1]); + XAVS2_SWAP_PTR(best->p_rec10[2], p_cu->cu_info.p_rec10[2]); + } /* ---- residual (coefficients) ---- */ XAVS2_SWAP_PTR(best->p_coeff[0], p_cu->cu_info.p_coeff[0]); @@ -391,19 +425,35 @@ void cu_copy_stored_parameters(xavs2_t *h, cu_t *p_cu, cu_info_t *best) cu_copy_info(&p_cu->cu_info, best); //===== reconstruction values ===== - g_funcs.pixf.copy_pp[PART_INDEX(blocksize, blocksize)](h->lcu.p_fdec[0] + pix_y * FDEC_STRIDE + pix_x, FDEC_STRIDE, - best->p_rec[0], FREC_STRIDE); - g_funcs.pixf.copy_ss[PART_INDEX(blocksize, blocksize)](h->lcu.lcu_coeff[0] + (p_cu->idx_zorder << 6), blocksize, + if (h->param->input_sample_bit_depth == 8) { + g_funcs.pixf.copy_pp8[PART_INDEX(blocksize, blocksize)](h->lcu.p_fdec8[0] + 
pix_y * FDEC_STRIDE + pix_x, FDEC_STRIDE, + best->p_rec8[0], FREC_STRIDE); + g_funcs.pixf.copy_ss8[PART_INDEX(blocksize, blocksize)](h->lcu.lcu_coeff[0] + (p_cu->idx_zorder << 6), blocksize, + best->p_coeff[0], blocksize); + + g_funcs.pixf.copy_pp8[part_idx_c](h->lcu.p_fdec8[1] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE, + best->p_rec8[1], FREC_CSTRIDE / 2); + g_funcs.pixf.copy_pp8[part_idx_c](h->lcu.p_fdec8[2] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE, + best->p_rec8[2], FREC_CSTRIDE / 2); + g_funcs.pixf.copy_ss8[part_idx_c](h->lcu.lcu_coeff[1] + (p_cu->idx_zorder << 4), blocksize >> 1, + best->p_coeff[1], blocksize >> 1); + g_funcs.pixf.copy_ss8[part_idx_c](h->lcu.lcu_coeff[2] + (p_cu->idx_zorder << 4), blocksize >> 1, + best->p_coeff[2], blocksize >> 1); + } else { + g_funcs.pixf.copy_pp10[PART_INDEX(blocksize, blocksize)](h->lcu.p_fdec10[0] + pix_y * FDEC_STRIDE + pix_x, FDEC_STRIDE, + best->p_rec10[0], FREC_STRIDE); + g_funcs.pixf.copy_ss10[PART_INDEX(blocksize, blocksize)](h->lcu.lcu_coeff[0] + (p_cu->idx_zorder << 6), blocksize, best->p_coeff[0], blocksize); - g_funcs.pixf.copy_pp[part_idx_c](h->lcu.p_fdec[1] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE, - best->p_rec[1], FREC_CSTRIDE / 2); - g_funcs.pixf.copy_pp[part_idx_c](h->lcu.p_fdec[2] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE, - best->p_rec[2], FREC_CSTRIDE / 2); - g_funcs.pixf.copy_ss[part_idx_c](h->lcu.lcu_coeff[1] + (p_cu->idx_zorder << 4), blocksize >> 1, + g_funcs.pixf.copy_pp10[part_idx_c](h->lcu.p_fdec10[1] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE, + best->p_rec10[1], FREC_CSTRIDE / 2); + g_funcs.pixf.copy_pp10[part_idx_c](h->lcu.p_fdec10[2] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE, + best->p_rec10[2], FREC_CSTRIDE / 2); + g_funcs.pixf.copy_ss10[part_idx_c](h->lcu.lcu_coeff[1] + (p_cu->idx_zorder << 4), blocksize >> 1, best->p_coeff[1], blocksize >> 1); - g_funcs.pixf.copy_ss[part_idx_c](h->lcu.lcu_coeff[2] + (p_cu->idx_zorder << 4), blocksize >> 1, + 
g_funcs.pixf.copy_ss10[part_idx_c](h->lcu.lcu_coeff[2] + (p_cu->idx_zorder << 4), blocksize >> 1, best->p_coeff[2], blocksize >> 1); + } //=============== cbp and mode =============== for (j = 0; j < size_in_scu; j++) { @@ -560,11 +610,11 @@ void cu_get_neighbors(xavs2_t *h, cu_t *p_cu, cb_t *p_cb) int b_available_TR = h->tab_avail_TR[(y_TR_4x4_in_lcu << (h->i_lcu_level - B4X4_IN_BIT)) + x_TR_4x4_in_lcu]; /* 2. get neighboring blocks */ - /* 左上 */ + /* 宸︿笂 */ cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOPLEFT ], xx0 - 1, yy0 - 1); - /* 左邻的PU信息 */ - if (IS_VER_PU_PART(p_cu->cu_info.i_mode) && p_cb->x != 0) { // CU垂直划分为两个PU,且当前PU为右边一个 + /* 宸﹂偦鐨凱U淇℃伅 */ + if (IS_VER_PU_PART(p_cu->cu_info.i_mode) && p_cb->x != 0) { // CU鍨傜洿鍒掑垎涓轰袱涓狿U锛屼笖褰撳墠PU涓哄彸杈逛竴涓 neighbor_inter_t *p_neighbor = neighbors + BLK_LEFT; p_neighbor->is_available = 1; // cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT], xx0 - 1, yy0); @@ -579,8 +629,8 @@ void cu_get_neighbors(xavs2_t *h, cu_t *p_cu, cb_t *p_cb) cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT2], xx0 - 1, yy1); } - /* 上邻的PU信息 */ - if (IS_HOR_PU_PART(p_cu->cu_info.i_mode) && p_cb->y != 0) { // CU水平划分为两个PU,且当前PU为下边一个 + /* 涓婇偦鐨凱U淇℃伅 */ + if (IS_HOR_PU_PART(p_cu->cu_info.i_mode) && p_cb->y != 0) { // CU姘村钩鍒掑垎涓轰袱涓狿U锛屼笖褰撳墠PU涓轰笅杈逛竴涓 neighbor_inter_t *p_neighbor = neighbors + BLK_TOP; p_neighbor->is_available = 1; // cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT], xx0 - 1, yy0); @@ -595,7 +645,7 @@ void cu_get_neighbors(xavs2_t *h, cu_t *p_cu, cb_t *p_cb) cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOP2], xx1, yy0 - 1); } - /* 右上 */ + /* 鍙充笂 */ cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOPRIGHT], b_available_TR ? 
xx1 + 1 : -1, yy0 - 1); cu_get_neighbor_temporal(h, &neighbors[BLK_COL], xx0, yy0); @@ -612,9 +662,9 @@ int cu_get_mvs_for_mc(xavs2_t *h, cu_t *p_cu, int pu_idx, { int num_ref; // number of reference frames int dmh_mode = p_cu->cu_info.dmh_mode; - int ref_1st = p_cu->cu_info.ref_idx_1st[pu_idx]; // 第一(前向或者B帧单向预测)运动矢量 - int ref_2nd = p_cu->cu_info.ref_idx_2nd[pu_idx]; // 第二(B帧双向的后向) - mv_t mv_1st, mv_2nd; // 第一(前向或者B帧单向预测)和第二(后向)运动矢量 + int ref_1st = p_cu->cu_info.ref_idx_1st[pu_idx]; // 绗竴锛堝墠鍚戞垨鑰匓甯у崟鍚戦娴嬶級杩愬姩鐭㈤噺 + int ref_2nd = p_cu->cu_info.ref_idx_2nd[pu_idx]; // 绗簩锛圔甯у弻鍚戠殑鍚庡悜锛 + mv_t mv_1st, mv_2nd; // 绗竴锛堝墠鍚戞垨鑰匓甯у崟鍚戦娴嬶級鍜岀浜岋紙鍚庡悜锛夎繍鍔ㄧ煝閲 if (h->i_type != SLICE_TYPE_B) { num_ref = (ref_1st != INVALID_REF) + (ref_2nd != INVALID_REF); @@ -764,9 +814,9 @@ static INLINE void tu_get_dct_coeff(xavs2_t *h, coeff_t *cur_blk, int pu_size_idx, int bsx, int bsy) { if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && !h->lcu.b_2nd_rdcost_pass && bsx >= 32 && bsy >= 32) { - g_funcs.dctf.dct_half[pu_size_idx](cur_blk, cur_blk, bsx); + g_funcs.dctf.dct_half[pu_size_idx](h, cur_blk, cur_blk, bsx); } else { - g_funcs.dctf.dct[pu_size_idx](cur_blk, cur_blk, bsx); + g_funcs.dctf.dct[pu_size_idx](h, cur_blk, cur_blk, bsx); } } @@ -796,20 +846,67 @@ static int cu_recon_chroma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, dist_t *distort int uv; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); coeff_t *cur_blk = p_enc->coeff_blk; - pel_t *p_pred; + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_pred; + + /* prediction buffer of chroma blocks */ + if (b_intra) { + p_pred = p_enc->intra8_pred_c[p_cu->cu_info.i_intra_mode_c]; + } else { + p_pred = p_enc->buf_pred_inter8_c; + } + + for (uv = 0; uv < 2; uv++) { + pel8_t *p_fdec = p_cu->cu_info.p_rec8[uv + 1]; + pel8_t *p_fenc = h->lcu.p_fenc8[uv + 1] + pix_y_c * FENC_STRIDE + pix_x_c; + + g_funcs.pixf.sub_ps8[partidx_c](cur_blk, bsize_c, p_fenc, p_pred, FENC_STRIDE, FREC_CSTRIDE); + + // DCT, quantization, inverse quantization, 
IDCT, and reconstruction + tu_get_dct_coeff(h, cur_blk, partidx_c, bsize_c, bsize_c); + + qp_c = cu_get_qp(h, &p_cu->cu_info); +#if ENABLE_WQUANT + qp_c += (uv == 0 ? h->param->chroma_quant_param_delta_u : h->param->chroma_quant_param_delta_v); +#endif + + qp_c = cu_get_chroma_qp(h, qp_c, uv); + + num_nonzero = tu_quant_forward(h, p_aec, p_cu, cur_blk, level_c, bsize_c, bsize_c, qp_c, b_intra, 0, DC_PRED); + cbp_c |= (num_nonzero != 0) << (4 + uv); + + if (num_nonzero) { + g_funcs.pixf.copy_ss8[partidx_c](p_cu->cu_info.p_coeff[uv + 1], bsize_c, cur_blk, bsize_c); + + tu_quant_inverse(h, p_cu, cur_blk, bsize_c * bsize_c, level_c, qp_c, 0); + g_funcs.dctf.idct[partidx_c](h, cur_blk, cur_blk, bsize_c); + + g_funcs.pixf.add_ps8[partidx_c](h, p_fdec, FREC_CSTRIDE / 2, p_pred, cur_blk, FREC_CSTRIDE, bsize_c); + } else { + g_funcs.pixf.copy_pp8[partidx_c](p_fdec, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE); + } + + *distortion += g_funcs.pixf.ssd8[partidx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); + + p_pred += (FREC_CSTRIDE >> 1); // uvoffset + } + + return cbp_c; + } else { + pel10_t *p_pred; /* prediction buffer of chroma blocks */ if (b_intra) { - p_pred = p_enc->intra_pred_c[p_cu->cu_info.i_intra_mode_c]; + p_pred = p_enc->intra10_pred_c[p_cu->cu_info.i_intra_mode_c]; } else { - p_pred = p_enc->buf_pred_inter_c; + p_pred = p_enc->buf_pred_inter10_c; } for (uv = 0; uv < 2; uv++) { - pel_t *p_fdec = p_cu->cu_info.p_rec[uv + 1]; - pel_t *p_fenc = h->lcu.p_fenc[uv + 1] + pix_y_c * FENC_STRIDE + pix_x_c; + pel10_t *p_fdec = p_cu->cu_info.p_rec10[uv + 1]; + pel10_t *p_fenc = h->lcu.p_fenc10[uv + 1] + pix_y_c * FENC_STRIDE + pix_x_c; - g_funcs.pixf.sub_ps[partidx_c](cur_blk, bsize_c, p_fenc, p_pred, FENC_STRIDE, FREC_CSTRIDE); + g_funcs.pixf.sub_ps10[partidx_c](cur_blk, bsize_c, p_fenc, p_pred, FENC_STRIDE, FREC_CSTRIDE); // DCT, quantization, inverse quantization, IDCT, and reconstruction tu_get_dct_coeff(h, cur_blk, partidx_c, bsize_c, bsize_c); @@ -825,22 +922,23 @@ 
static int cu_recon_chroma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, dist_t *distort cbp_c |= (num_nonzero != 0) << (4 + uv); if (num_nonzero) { - g_funcs.pixf.copy_ss[partidx_c](p_cu->cu_info.p_coeff[uv + 1], bsize_c, cur_blk, bsize_c); + g_funcs.pixf.copy_ss10[partidx_c](p_cu->cu_info.p_coeff[uv + 1], bsize_c, cur_blk, bsize_c); tu_quant_inverse(h, p_cu, cur_blk, bsize_c * bsize_c, level_c, qp_c, 0); - g_funcs.dctf.idct[partidx_c](cur_blk, cur_blk, bsize_c); + g_funcs.dctf.idct[partidx_c](h, cur_blk, cur_blk, bsize_c); - g_funcs.pixf.add_ps[partidx_c](p_fdec, FREC_CSTRIDE / 2, p_pred, cur_blk, FREC_CSTRIDE, bsize_c); + g_funcs.pixf.add_ps10[partidx_c](h, p_fdec, FREC_CSTRIDE / 2, p_pred, cur_blk, FREC_CSTRIDE, bsize_c); } else { - g_funcs.pixf.copy_pp[partidx_c](p_fdec, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE); + g_funcs.pixf.copy_pp10[partidx_c](p_fdec, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE); } - *distortion += g_funcs.pixf.ssd[partidx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); + *distortion += g_funcs.pixf.ssd10[partidx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); p_pred += (FREC_CSTRIDE >> 1); // uvoffset } return cbp_c; + } } /* --------------------------------------------------------------------------- @@ -853,9 +951,9 @@ int rdo_get_left_bits(xavs2_t *h, rdcost_t min_rdcost, dist_t distortion) double f_left_bits = ((min_rdcost - distortion) * h->f_lambda_1th) + 1; int left_bits; - left_bits = (int)XAVS2_CLIP3F(0.0f, 32766.0f, f_left_bits); // clip到一个合理的区间内 + left_bits = (int)XAVS2_CLIP3F(0.0f, 32766.0f, f_left_bits); // clip鍒颁竴涓悎鐞嗙殑鍖洪棿鍐 if (left_bits * f_lambda + distortion <= min_rdcost) { - left_bits++; // 避免浮点数运算误差,保证比特数达到该值时rdcost大于min_rdcost + left_bits++; // 閬垮厤娴偣鏁拌繍绠楄宸紝淇濊瘉姣旂壒鏁拌揪鍒拌鍊兼椂rdcost澶т簬min_rdcost } return left_bits; @@ -873,7 +971,81 @@ int rdo_get_left_bits(xavs2_t *h, rdcost_t min_rdcost, dist_t distortion) * and reconstruction pixel generation of a intra luma block */ static INLINE -int cu_recon_intra_luma(xavs2_t *h, aec_t 
*p_aec, cu_t *p_cu, pel_t *p_pred, int bsx, int bsy, +int cu_recon_intra_luma8(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel8_t *p_pred, int bsx, int bsy, + int block_x, int block_y, int idx_tu, int intra_pred_mode, dist_t *distortion) +{ + int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS); + int i_tu_level = p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON); + int pos_x = p_cu->i_pos_x + block_x; + int pos_y = p_cu->i_pos_y + block_y; + int part_idx = PART_INDEX(bsx, bsy); + int w_tr = bsx >> used_wavelet; + int h_tr = bsy >> used_wavelet; + int num_non_zero; + int b_2nd_trans = h->param->enable_secT; + cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); + pel8_t *p_fenc = h->lcu.p_fenc8[0] + pos_y * FENC_STRIDE + pos_x; + pel8_t *p_fdec = p_cu->cu_info.p_rec8[0] + block_y * FREC_STRIDE + block_x; + coeff_t *cur_blk = p_enc->coeff_blk; + coeff_t *p_coeff_y = p_cu->cu_info.p_coeff[0] + (idx_tu << ((p_cu->cu_info.i_level - 1) << 1)); + int b_top = IS_NEIGHBOR_AVAIL(p_cu->block_avail, MD_I_TOP); + int b_left = IS_NEIGHBOR_AVAIL(p_cu->block_avail, MD_I_LEFT); + + // get prediction and prediction error + g_funcs.pixf.sub_ps8[PART_INDEX(bsx, bsy)](cur_blk, bsx, p_fenc, p_pred, FENC_STRIDE, bsx); + + // block transform + if (part_idx == LUMA_4x4) { + if (b_2nd_trans) { + g_funcs.dctf.transform_4x4_2nd(h, cur_blk, w_tr); + } else { + g_funcs.dctf.dct[LUMA_4x4](h, cur_blk, cur_blk, 4); /* 4x4 dct */ + } + } else { + tu_get_dct_coeff(h, cur_blk, part_idx, w_tr, h_tr); + + if (b_2nd_trans) { + g_funcs.dctf.transform_2nd(cur_blk, w_tr, intra_pred_mode, b_top, b_left); + } + } + + // quantization + num_non_zero = tu_quant_forward(h, p_aec, p_cu, cur_blk, i_tu_level, w_tr, h_tr, cu_get_qp(h, &p_cu->cu_info), 1, 1, intra_pred_mode); + + if (num_non_zero) { + g_funcs.pixf.copy_ss8[PART_INDEX(w_tr, h_tr)](p_coeff_y, w_tr, cur_blk, w_tr); + + // inverse quantization + tu_quant_inverse(h, p_cu, 
cur_blk, w_tr * h_tr, i_tu_level, cu_get_qp(h, &p_cu->cu_info), 1); + + // inverse transform + if (part_idx == LUMA_4x4) { + if (b_2nd_trans) { + g_funcs.dctf.inv_transform_4x4_2nd(h, cur_blk, w_tr); + } else { + g_funcs.dctf.idct[LUMA_4x4](h, cur_blk, cur_blk, 4); /* 4x4 idct */ + } + } else { + if (b_2nd_trans) { + g_funcs.dctf.inv_transform_2nd(cur_blk, w_tr, intra_pred_mode, b_top, b_left); + } + + g_funcs.dctf.idct[part_idx](h, cur_blk, cur_blk, w_tr); + } + + g_funcs.pixf.add_ps8[part_idx](h, p_fdec, FREC_STRIDE, p_pred, cur_blk, bsx, bsx); + } else { + g_funcs.pixf.copy_pp8[part_idx](p_fdec, FREC_STRIDE, p_pred, bsx); + } + + // get distortion (SSD) of current block + *distortion = g_funcs.pixf.ssd8[part_idx](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); + + return num_non_zero; +} + +static INLINE +int cu_recon_intra_luma10(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel10_t *p_pred, int bsx, int bsy, int block_x, int block_y, int idx_tu, int intra_pred_mode, dist_t *distortion) { int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS); @@ -886,22 +1058,22 @@ int cu_recon_intra_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel_t *p_pred, int int num_non_zero; int b_2nd_trans = h->param->enable_secT; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); - pel_t *p_fenc = h->lcu.p_fenc[0] + pos_y * FENC_STRIDE + pos_x; - pel_t *p_fdec = p_cu->cu_info.p_rec[0] + block_y * FREC_STRIDE + block_x; + pel10_t *p_fenc = h->lcu.p_fenc10[0] + pos_y * FENC_STRIDE + pos_x; + pel10_t *p_fdec = p_cu->cu_info.p_rec10[0] + block_y * FREC_STRIDE + block_x; coeff_t *cur_blk = p_enc->coeff_blk; coeff_t *p_coeff_y = p_cu->cu_info.p_coeff[0] + (idx_tu << ((p_cu->cu_info.i_level - 1) << 1)); int b_top = IS_NEIGHBOR_AVAIL(p_cu->block_avail, MD_I_TOP); int b_left = IS_NEIGHBOR_AVAIL(p_cu->block_avail, MD_I_LEFT); // get prediction and prediction error - g_funcs.pixf.sub_ps[PART_INDEX(bsx, bsy)](cur_blk, bsx, p_fenc, p_pred, 
FENC_STRIDE, bsx); + g_funcs.pixf.sub_ps10[PART_INDEX(bsx, bsy)](cur_blk, bsx, p_fenc, p_pred, FENC_STRIDE, bsx); // block transform if (part_idx == LUMA_4x4) { if (b_2nd_trans) { - g_funcs.dctf.transform_4x4_2nd(cur_blk, w_tr); + g_funcs.dctf.transform_4x4_2nd(h, cur_blk, w_tr); } else { - g_funcs.dctf.dct[LUMA_4x4](cur_blk, cur_blk, 4); /* 4x4 dct */ + g_funcs.dctf.dct[LUMA_4x4](h, cur_blk, cur_blk, 4); /* 4x4 dct */ } } else { tu_get_dct_coeff(h, cur_blk, part_idx, w_tr, h_tr); @@ -915,7 +1087,7 @@ int cu_recon_intra_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel_t *p_pred, int num_non_zero = tu_quant_forward(h, p_aec, p_cu, cur_blk, i_tu_level, w_tr, h_tr, cu_get_qp(h, &p_cu->cu_info), 1, 1, intra_pred_mode); if (num_non_zero) { - g_funcs.pixf.copy_ss[PART_INDEX(w_tr, h_tr)](p_coeff_y, w_tr, cur_blk, w_tr); + g_funcs.pixf.copy_ss10[PART_INDEX(w_tr, h_tr)](p_coeff_y, w_tr, cur_blk, w_tr); // inverse quantization tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_tu_level, cu_get_qp(h, &p_cu->cu_info), 1); @@ -923,25 +1095,25 @@ int cu_recon_intra_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel_t *p_pred, int // inverse transform if (part_idx == LUMA_4x4) { if (b_2nd_trans) { - g_funcs.dctf.inv_transform_4x4_2nd(cur_blk, w_tr); + g_funcs.dctf.inv_transform_4x4_2nd(h, cur_blk, w_tr); } else { - g_funcs.dctf.idct[LUMA_4x4](cur_blk, cur_blk, 4); /* 4x4 idct */ + g_funcs.dctf.idct[LUMA_4x4](h, cur_blk, cur_blk, 4); /* 4x4 idct */ } } else { if (b_2nd_trans) { g_funcs.dctf.inv_transform_2nd(cur_blk, w_tr, intra_pred_mode, b_top, b_left); } - g_funcs.dctf.idct[part_idx](cur_blk, cur_blk, w_tr); + g_funcs.dctf.idct[part_idx](h, cur_blk, cur_blk, w_tr); } - g_funcs.pixf.add_ps[part_idx](p_fdec, FREC_STRIDE, p_pred, cur_blk, bsx, bsx); + g_funcs.pixf.add_ps10[part_idx](h, p_fdec, FREC_STRIDE, p_pred, cur_blk, bsx, bsx); } else { - g_funcs.pixf.copy_pp[part_idx](p_fdec, FREC_STRIDE, p_pred, bsx); + g_funcs.pixf.copy_pp10[part_idx](p_fdec, FREC_STRIDE, p_pred, bsx); } // get 
distortion (SSD) of current block - *distortion = g_funcs.pixf.ssd[part_idx](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); + *distortion = g_funcs.pixf.ssd10[part_idx](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); return num_non_zero; } @@ -992,7 +1164,7 @@ void xavs2_get_mpms(xavs2_t *h, cu_t *p_cu, int blockidx, int pos_y, int pos_x, /* --------------------------------------------------------------------------- - * 检查帧内PU划分方式的RDCost并更新最优的PU划分方式 + * 妫鏌ュ抚鍐匬U鍒掑垎鏂瑰紡鐨凴DCost骞舵洿鏂版渶浼樼殑PU鍒掑垎鏂瑰紡 */ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best, int mode, rdcost_t *min_rdcost) { @@ -1002,8 +1174,6 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best rdcost_t rdcost_luma = 0; rdcost_t rdcost = MAX_COST; rdcost_t min_mode_rdcost = MAX_COST; - pel_t *rec_bak_y = best->p_rec[0]; - pel_t *p_best_part[4]; int blockidx; int num_luma_block = mode != PRED_I_2Nx2N ? 4 : 1; int b_need_swap_buf = 0; @@ -1011,10 +1181,10 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best int pix_y_c = p_cu->i_pos_y >> CHROMA_V_SHIFT; intra_candidate_t *p_candidates = p_layer->intra_candidates; - /* 确定PU划分类型 */ + /* 纭畾PU鍒掑垎绫诲瀷 */ cu_init_pu_intra(h, &p_cu->cu_info, level, mode); - /* 确定TU划分类型 */ + /* 纭畾TU鍒掑垎绫诲瀷 */ cu_set_tu_split_type(h, &p_cu->cu_info, mode != PRED_I_2Nx2N); h->copy_aec_state_rdo(&p_layer->cs_rdo, p_aec); @@ -1022,6 +1192,9 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best p_cu->intra_avail = (uint8_t)xavs2_intra_get_cu_neighbors(h, p_cu, p_cu->i_pix_x, p_cu->i_pix_y, p_cu->i_size); + if (h->param->input_sample_bit_depth == 8) { + pel8_t *rec_bak_y = best->p_rec8[0]; + pel8_t *p_best_part[4]; /* 1, intra luma prediction and mode decision */ for (blockidx = 0; blockidx < num_luma_block; blockidx++) { int mpm[2]; // most probable modes (MPMs) for current luma block @@ -1037,7 +1210,7 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t 
*best int best_mode = 0; int best_pmode = 0; int best_cbp = 0; - pel_t *p_fenc = h->lcu.p_fenc[0] + pos_y * FENC_STRIDE + pos_x; + pel8_t *p_fenc = h->lcu.p_fenc8[0] + pos_y * FENC_STRIDE + pos_x; rdcost_t best_rdcost = MAX_COST; int i; int num_for_rdo; @@ -1052,7 +1225,7 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best } /* conduct prediction and get intra prediction direction candidates for RDO */ - num_for_rdo = h->lcu.get_intra_dir_for_rdo_luma(h, p_cu, p_candidates, p_fenc, mpm, blockidx, + num_for_rdo = h->lcu.get_intra_dir_for_rdo_luma8(h, p_cu, p_candidates, p_fenc, mpm, blockidx, block_x, block_y, block_w, block_h); // store the coding state @@ -1061,16 +1234,16 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best /* RDO */ for (i = 0; i < num_for_rdo; i++) { //rdcost_t rdcost; - dist_t dist_curr; // 当前亮度帧内块的失真 - int rate_curr = 0; // 当前亮度帧内块的码率(比特数) + dist_t dist_curr; // 褰撳墠浜害甯у唴鍧楃殑澶辩湡 + int rate_curr = 0; // 褰撳墠浜害甯у唴鍧楃殑鐮佺巼锛堟瘮鐗规暟锛 int Mode = p_candidates[i].mode; - pel_t *p_pred = p_enc->intra_pred[Mode]; + pel8_t *p_pred = p_enc->intra8_pred[Mode]; // get and check rate_chroma-distortion cost int mode_idx_aec = (mpm[0] == Mode) ? -2 : ((mpm[1] == Mode) ? -1 : (mpm[0] > Mode ? Mode : (mpm[1] > Mode ? 
Mode - 1 : Mode - 2))); int num_nonzero; - num_nonzero = cu_recon_intra_luma(h, p_aec, p_cu, p_pred, + num_nonzero = cu_recon_intra_luma8(h, p_aec, p_cu, p_pred, block_w, block_h, block_x, block_y, blockidx, Mode, &dist_curr); num_nonzero = !!num_nonzero; @@ -1098,7 +1271,7 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best // choose best mode if (rdcost < best_rdcost) { - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); // set best mode update minimum cost @@ -1123,14 +1296,14 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best /* change the coding state to BEST */ if (best_rate < INT_MAX) { if (p_cu->cu_info.i_mode != PRED_I_2Nx2N) { - g_funcs.pixf.copy_pp[PART_INDEX(block_w, block_h)](h->lcu.p_fdec[0] + pos_y * FDEC_STRIDE + pos_x, FDEC_STRIDE, - p_layer->p_rec_tmp[0] + block_y * FREC_STRIDE + block_x, FREC_STRIDE); + g_funcs.pixf.copy_pp8[PART_INDEX(block_w, block_h)](h->lcu.p_fdec8[0] + pos_y * FDEC_STRIDE + pos_x, FDEC_STRIDE, + p_layer->p_rec8_tmp[0] + block_y * FREC_STRIDE + block_x, FREC_STRIDE); } /* copy coefficients and reconstructed data for best mode */ - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); - p_best_part[blockidx] = p_cu->cu_info.p_rec[0]; + p_best_part[blockidx] = p_cu->cu_info.p_rec8[0]; /* set intra mode prediction */ p_cu->cu_info.pred_intra_modes[blockidx] = (int8_t)best_pmode; @@ -1140,29 +1313,29 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); } - /* 保存最优模式的状态:失真、亮度分量比特数(排除掉亮度预测模式),CBP */ + /* 淇濆瓨鏈浼樻ā寮忕殑鐘舵侊細澶辩湡銆佷寒搴﹀垎閲忔瘮鐗规暟锛堟帓闄ゆ帀浜害棰勬祴妯″紡锛夛紝CBP */ rdcost_luma += best_dist + h->f_lambda_mode * 
best_rate; p_cu->cu_info.i_cbp |= (best_cbp) << blockidx; - /* 亮度块RDO的提前终止 */ + /* 浜害鍧桼DO鐨勬彁鍓嶇粓姝 */ if (rdcost_luma >= *min_rdcost) { p_layer->mode_rdcost[mode] = MAX_COST; /* set the cost for SDIP fast algorithm */ h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); - return; // 亮度块的最优rdcost已经超过当前最优值,停止后续色度块的模式遍历 + return; // 浜害鍧楃殑鏈浼榬dcost宸茬粡瓒呰繃褰撳墠鏈浼樺硷紝鍋滄鍚庣画鑹插害鍧楃殑妯″紡閬嶅巻 } } p_cu->feature.rdcost_luma = rdcost_luma; /* 2, store best luma reconstruction pixels */ for (blockidx = 0; blockidx < num_luma_block; blockidx++) { - if (p_best_part[blockidx] != p_cu->cu_info.p_rec[0]) { + if (p_best_part[blockidx] != p_cu->cu_info.p_rec8[0]) { int offset = p_cu->cu_info.cb[blockidx].y * FREC_STRIDE + p_cu->cu_info.cb[blockidx].x; int offset_coeff = blockidx << ((p_cu->cu_info.i_level - 1) << 1); int w_tr = p_cu->cu_info.cb[0].w; int h_tr = p_cu->cu_info.cb[0].h; int part_idx = PART_INDEX(w_tr, h_tr); - g_funcs.pixf.copy_pp[part_idx](p_cu->cu_info.p_rec[0] + offset, FREC_STRIDE, p_layer->p_rec_tmp[0] + offset, p_cu->i_size); - g_funcs.pixf.copy_ss[part_idx](p_cu->cu_info.p_coeff[0] + offset_coeff, w_tr, p_layer->p_coeff_tmp[0] + offset_coeff, w_tr); + g_funcs.pixf.copy_pp8[part_idx](p_cu->cu_info.p_rec8[0] + offset, FREC_STRIDE, p_layer->p_rec8_tmp[0] + offset, p_cu->i_size); + g_funcs.pixf.copy_ss8[part_idx](p_cu->cu_info.p_coeff[0] + offset_coeff, w_tr, p_layer->p_coeff_tmp[0] + offset_coeff, w_tr); } } @@ -1177,13 +1350,13 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best num_rdo_chroma_mode = h->get_intra_candidates_chroma(h, p_cu, level - 1, pix_y_c, pix_x_c, p_candidates); for (idx_chroma_mode = 0; idx_chroma_mode < num_rdo_chroma_mode; idx_chroma_mode++) { - dist_t dist_chroma = 0; // 色度块的指针 + dist_t dist_chroma = 0; // 鑹插害鍧楃殑鎸囬拡 int rate_chroma = 0; int bits_left; int predmode_c = p_candidates[idx_chroma_mode].mode; int cbp_c; - /* 跳过色度分量第二次调用过程中的模式选择,直接选到最优模式完成RDOQ */ + /* 璺宠繃鑹插害鍒嗛噺绗簩娆¤皟鐢ㄨ繃绋嬩腑鐨勬ā寮忛夋嫨锛岀洿鎺ラ夊埌鏈浼樻ā寮忓畬鎴怰DOQ */ if 
((h->param->i_rdoq_level == RDOQ_CU_LEVEL && h->lcu.b_enable_rdoq) && predmode_c != best->i_intra_mode_c) { continue; } @@ -1192,7 +1365,7 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best } p_cu->cu_info.i_intra_mode_c = (int8_t)predmode_c; - /* 完成RDO过程的色度块的重构过程(变换、量化、反变换反量化及求重构值) */ + /* 瀹屾垚RDO杩囩▼鐨勮壊搴﹀潡鐨勯噸鏋勮繃绋嬶紙鍙樻崲銆侀噺鍖栥佸弽鍙樻崲鍙嶉噺鍖栧強姹傞噸鏋勫硷級 */ cbp_c = cu_recon_chroma(h, p_aec, p_cu, &dist_chroma); p_cu->cu_info.i_cbp = (int8_t)(tmp_cbp_luma + cbp_c); @@ -1257,153 +1430,517 @@ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); /* revert to initial AEC context */ /* 4, confirm the buffer pointers and record the best information */ - if (best->p_rec[0] == rec_bak_y && b_need_swap_buf) { - XAVS2_SWAP_PTR(best->p_rec[0], p_cu->cu_info.p_rec[0]); + if (best->p_rec8[0] == rec_bak_y && b_need_swap_buf) { + XAVS2_SWAP_PTR(best->p_rec8[0], p_cu->cu_info.p_rec8[0]); XAVS2_SWAP_PTR(best->p_coeff[0], p_cu->cu_info.p_coeff[0]); } p_layer->mode_rdcost[mode] = min_mode_rdcost; /* store the cost for SDIP fast algorithm */ -} + } else { + pel10_t *rec_bak_y = best->p_rec10[0]; + pel10_t *p_best_part[4]; + /* 1, intra luma prediction and mode decision */ + for (blockidx = 0; blockidx < num_luma_block; blockidx++) { + int mpm[2]; // most probable modes (MPMs) for current luma block + int block_x = p_cu->cu_info.cb[blockidx].x; + int block_y = p_cu->cu_info.cb[blockidx].y; + int block_w = p_cu->cu_info.cb[blockidx].w; + int block_h = p_cu->cu_info.cb[blockidx].h; + int pos_x = p_cu->i_pos_x + block_x; + int pos_y = p_cu->i_pos_y + block_y; + int b4x4_x = (p_cu->i_pix_x + block_x) >> MIN_PU_SIZE_IN_BIT; + dist_t best_dist = MAX_DISTORTION; + int best_rate = INT_MAX; + int best_mode = 0; + int best_pmode = 0; + int best_cbp = 0; + pel10_t *p_fenc = h->lcu.p_fenc10[0] + pos_y * FENC_STRIDE + pos_x; + rdcost_t best_rdcost = MAX_COST; + int i; + int num_for_rdo; + p_candidates = 
p_layer->intra_candidates; // candidate list, reserving the cost -//#if OPT_BYPASS_SDIP -/* --------------------------------------------------------------------------- - * SDIP fast - */ -static ALWAYS_INLINE int sdip_early_bypass(xavs2_t *h, cu_layer_t *p_layer, int i_mode) -{ - UNUSED_PARAMETER(h); - return i_mode == PRED_I_nx2N && (p_layer->mode_rdcost[PRED_I_2Nxn] < p_layer->mode_rdcost[PRED_I_2Nx2N] * 0.9); -} -//#endif + /* init */ + xavs2_get_mpms(h, p_cu, blockidx, pos_y, b4x4_x, mpm); -/** - * =========================================================================== - * local function defines (inter) - * =========================================================================== - */ + for (i = 0; i < INTRA_MODE_NUM_FOR_RDO; i++) { + p_candidates[i].mode = 0; + p_candidates[i].cost = MAX_COST; + } -//#if OPT_FAST_ZBLOCK || OPT_ECU -static const int tab_th_zero_block_sad[][5] = { - { 7, 19, 72, 281, 1115 }, { 7, 19, 73, 281, 1116 }, { 7, 20, 73, 282, 1118 }, - { 8, 20, 74, 283, 1120 }, { 8, 20, 74, 284, 1122 }, { 8, 20, 75, 285, 1124 }, - { 8, 21, 75, 286, 1126 }, { 8, 21, 76, 288, 1129 }, { 9, 21, 77, 289, 1132 }, - { 9, 22, 77, 291, 1135 }, { 9, 22, 78, 292, 1138 }, { 10, 23, 79, 294, 1142 }, - { 10, 23, 80, 296, 1146 }, { 10, 24, 81, 298, 1150 }, { 11, 24, 82, 301, 1155 }, - { 11, 25, 84, 303, 1160 }, { 12, 26, 85, 306, 1166 }, { 12, 26, 87, 309, 1172 }, - { 13, 27, 88, 312, 1179 }, { 13, 28, 90, 316, 1186 }, { 14, 29, 92, 320, 1194 }, - { 15, 30, 94, 325, 1203 }, { 15, 31, 97, 329, 1213 }, { 16, 33, 99, 334, 1223 }, - { 17, 34, 102, 340, 1235 }, { 18, 36, 105, 346, 1247 }, { 20, 37, 109, 353, 1260 }, - { 21, 39, 112, 360, 1275 }, { 22, 41, 116, 368, 1292 }, { 24, 43, 121, 377, 1309 }, - { 25, 46, 125, 386, 1328 }, { 27, 48, 131, 397, 1349 }, { 29, 51, 136, 408, 1372 }, - { 31, 54, 142, 420, 1397 }, { 33, 58, 149, 434, 1424 }, { 36, 61, 156, 448, 1453 }, - { 38, 65, 164, 464, 1485 }, { 41, 70, 173, 482, 1520 }, { 45, 74, 183, 501, 1559 }, - { 48, 79, 
193, 521, 1600 }, { 52, 85, 204, 544, 1646 }, { 56, 91, 217, 569, 1696 }, - { 61, 98, 230, 596, 1750 }, { 66, 105, 245, 625, 1809 }, { 71, 113, 261, 657, 1873 }, - { 77, 122, 278, 692, 1944 }, { 83, 132, 297, 729, 2020 }, { 90, 142, 318, 771, 2104 }, - { 98, 153, 341, 816, 2195 }, { 106, 166, 365, 865, 2294 }, { 116, 179, 392, 919, 2403 }, - { 126, 194, 422, 978, 2521 }, { 136, 210, 454, 1042, 2649 }, { 148, 227, 488, 1111, 2790 }, - { 161, 246, 526, 1187, 2943 }, { 175, 267, 568, 1270, 3110 }, { 191, 290, 613, 1360, 3292 }, - { 207, 314, 662, 1459, 3491 }, { 225, 341, 716, 1566, 3707 }, { 245, 370, 775, 1683, 3944 }, - { 267, 402, 839, 1811, 4201 }, { 291, 437, 909, 1950, 4482 }, { 316, 475, 985, 2102, 4788 }, - { 345, 517, 1068, 2268, 5123 }, { 375, 562, 1158, 2448, 5487 }, { 412, 617, 1268, 2667, 5928 }, - { 445, 665, 1364, 2860, 6317 }, { 485, 724, 1482, 3094, 6790 }, { 528, 788, 1610, 3350, 7305 }, - { 576, 858, 1749, 3628, 7867 }, { 631, 939, 1912, 3954, 8524 }, { 687, 1022, 2078, 4285, 9192 }, - { 748, 1113, 2259, 4647, 9920 }, { 812, 1206, 2446, 5019, 10671 }, { 884, 1313, 2661, 5448, 11537 }, - { 964, 1431, 2895, 5917, 12482 }, { 1047, 1553, 3140, 6406, 13469 }, { 1145, 1698, 3430, 6985, 14636 }, - { 1248, 1850, 3735, 7592, 15862 }, { 1357, 2011, 4055, 8233, 17154 } -}; + /* conduct prediction and get intra prediction direction candidates for RDO */ + num_for_rdo = h->lcu.get_intra_dir_for_rdo_luma10(h, p_cu, p_candidates, p_fenc, mpm, blockidx, + block_x, block_y, block_w, block_h); -/* --------------------------------------------------------------------------- - */ -static ALWAYS_INLINE -bool_t isZeroCuFast(xavs2_t *h, cu_t *p_cu) -{ - int i_level = p_cu->cu_info.i_level - MIN_PU_SIZE_IN_BIT; - int i_qp = cu_get_qp(h, &p_cu->cu_info); - int thres_satd = (int)(tab_th_zero_block_sad[i_qp][i_level] * h->param->factor_zero_block); + // store the coding state + h->copy_aec_state_rdo(&p_enc->cs_pu_init, p_aec); - return p_cu->sum_satd < thres_satd; -} 
-//#endif + /* RDO */ + for (i = 0; i < num_for_rdo; i++) { + //rdcost_t rdcost; + dist_t dist_curr; // 褰撳墠浜害甯у唴鍧楃殑澶辩湡 + int rate_curr = 0; // 褰撳墠浜害甯у唴鍧楃殑鐮佺巼锛堟瘮鐗规暟锛 + int Mode = p_candidates[i].mode; + pel10_t *p_pred = p_enc->intra10_pred[Mode]; -/* --------------------------------------------------------------------------- - * int scrFlag = 0; // 0=noSCR, 1=strongSCR, 2=jmSCR - */ -static INLINE int -tu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, - int i_level, int8_t *cbp, int blockidx, coeff_t *cur_blk, - int x_pu, int y_pu, int w_pu, int h_pu) -{ - cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); - int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS); - int part_idx = PART_INDEX(w_pu, h_pu); - int w_tr = w_pu >> used_wavelet; - int h_tr = h_pu >> used_wavelet; - int num_non_zero = 0; - pel_t *p_fdec = p_cu->cu_info.p_rec[0] + y_pu * FREC_STRIDE + x_pu; - pel_t *p_pred = p_layer->buf_pred_inter + y_pu * FREC_STRIDE + x_pu; - coeff_t *coeff_y = p_cu->cu_info.p_coeff[0] + (blockidx << ((p_cu->cu_info.i_level - 1) << 1)); + // get and check rate_chroma-distortion cost + int mode_idx_aec = (mpm[0] == Mode) ? -2 : ((mpm[1] == Mode) ? -1 : (mpm[0] > Mode ? Mode : (mpm[1] > Mode ? 
Mode - 1 : Mode - 2))); + int num_nonzero; - tu_get_dct_coeff(h, cur_blk, part_idx, w_tr, h_tr); + num_nonzero = cu_recon_intra_luma10(h, p_aec, p_cu, p_pred, + block_w, block_h, block_x, block_y, + blockidx, Mode, &dist_curr); + num_nonzero = !!num_nonzero; + { + int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS); + int w_tr = block_w >> used_wavelet; + int i_tu_level = p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON) - used_wavelet; + int rate_luma_mode; + coeff_t *p_coeff_y = p_cu->cu_info.p_coeff[0] + (blockidx << ((p_cu->cu_info.i_level - 1) << 1)); - num_non_zero = tu_quant_forward(h, p_aec, p_cu, cur_blk, i_level, w_tr, h_tr, - cu_get_qp(h, &p_cu->cu_info), 0, 1, DC_PRED); + // get rate for intra prediction mode + rate_luma_mode = p_aec->binary.write_intra_pred_mode(p_aec, mode_idx_aec); - if (num_non_zero != 0) { - *cbp |= (1 << blockidx); // 指定位设置为 1 - g_funcs.pixf.copy_ss[PART_INDEX(w_tr, h_tr)](coeff_y, w_tr, cur_blk, w_tr); + // get rate for luminance coefficients + if (num_nonzero) { + int bits_left = rdo_get_left_bits(h, best_rdcost, dist_curr) - rate_luma_mode; + rate_curr = p_aec->binary.est_luma_block_coeff(h, p_aec, p_cu, p_coeff_y, &p_enc->runlevel, i_tu_level, xavs2_log2u(w_tr), + 1, Mode, bits_left); + rate_luma_mode += rate_curr; + } - tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_level, cu_get_qp(h, &p_cu->cu_info), 1); - g_funcs.dctf.idct[part_idx](cur_blk, cur_blk, w_tr); + // calculate RD-cost and return it + rdcost = dist_curr + h->f_lambda_mode * rate_luma_mode; + } - g_funcs.pixf.add_ps[part_idx](p_fdec, FREC_STRIDE, p_pred, cur_blk, FREC_STRIDE, w_pu); + // choose best mode + if (rdcost < best_rdcost) { + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); + + // set best mode update minimum cost + best_dist = dist_curr; + best_rate = rate_curr; + best_rdcost = rdcost; + 
best_mode = Mode; + best_pmode = mode_idx_aec; + best_cbp = num_nonzero; // flag if dct-coefficients must be coded + h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); + } + + h->copy_aec_state_rdo(p_aec, &p_enc->cs_pu_init); + + if (IS_ALG_ENABLE(OPT_ET_RDO_INTRA_L)) { + if (rdcost > best_rdcost * 1.2) { + break; + } + } + } // for (i = 0; i < num_for_rdo; i++) + + /* change the coding state to BEST */ + if (best_rate < INT_MAX) { + if (p_cu->cu_info.i_mode != PRED_I_2Nx2N) { + g_funcs.pixf.copy_pp10[PART_INDEX(block_w, block_h)](h->lcu.p_fdec10[0] + pos_y * FDEC_STRIDE + pos_x, FDEC_STRIDE, + p_layer->p_rec10_tmp[0] + block_y * FREC_STRIDE + block_x, FREC_STRIDE); + } + + /* copy coefficients and reconstructed data for best mode */ + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); + p_best_part[blockidx] = p_cu->cu_info.p_rec10[0]; + + /* set intra mode prediction */ + p_cu->cu_info.pred_intra_modes[blockidx] = (int8_t)best_pmode; + p_cu->cu_info.real_intra_modes[blockidx] = (int8_t)best_mode; + + /* copy coding state */ + h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); + } + + /* 淇濆瓨鏈浼樻ā寮忕殑鐘舵侊細澶辩湡銆佷寒搴﹀垎閲忔瘮鐗规暟锛堟帓闄ゆ帀浜害棰勬祴妯″紡锛夛紝CBP */ + rdcost_luma += best_dist + h->f_lambda_mode * best_rate; + p_cu->cu_info.i_cbp |= (best_cbp) << blockidx; + + /* 浜害鍧桼DO鐨勬彁鍓嶇粓姝 */ + if (rdcost_luma >= *min_rdcost) { + p_layer->mode_rdcost[mode] = MAX_COST; /* set the cost for SDIP fast algorithm */ + h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); + return; // 浜害鍧楃殑鏈浼榬dcost宸茬粡瓒呰繃褰撳墠鏈浼樺硷紝鍋滄鍚庣画鑹插害鍧楃殑妯″紡閬嶅巻 + } + } + p_cu->feature.rdcost_luma = rdcost_luma; + + /* 2, store best luma reconstruction pixels */ + for (blockidx = 0; blockidx < num_luma_block; blockidx++) { + if (p_best_part[blockidx] != p_cu->cu_info.p_rec10[0]) { + int offset = p_cu->cu_info.cb[blockidx].y * FREC_STRIDE + p_cu->cu_info.cb[blockidx].x; + int offset_coeff = blockidx << ((p_cu->cu_info.i_level - 1) << 1); + int w_tr = 
p_cu->cu_info.cb[0].w; + int h_tr = p_cu->cu_info.cb[0].h; + int part_idx = PART_INDEX(w_tr, h_tr); + g_funcs.pixf.copy_pp10[part_idx](p_cu->cu_info.p_rec10[0] + offset, FREC_STRIDE, p_layer->p_rec10_tmp[0] + offset, p_cu->i_size); + g_funcs.pixf.copy_ss10[part_idx](p_cu->cu_info.p_coeff[0] + offset_coeff, w_tr, p_layer->p_coeff_tmp[0] + offset_coeff, w_tr); + } + } + + /* 3, Chroma mode decision and CU mode updating */ + if (h->param->chroma_format != CHROMA_400) { + int lmode; + int num_rdo_chroma_mode; + int idx_chroma_mode; + int tmp_cbp_luma = p_cu->cu_info.i_cbp; + + lmode = tab_intra_mode_luma2chroma[p_cu->cu_info.real_intra_modes[0]]; + num_rdo_chroma_mode = h->get_intra_candidates_chroma(h, p_cu, level - 1, pix_y_c, pix_x_c, p_candidates); + + for (idx_chroma_mode = 0; idx_chroma_mode < num_rdo_chroma_mode; idx_chroma_mode++) { + dist_t dist_chroma = 0; // 鑹插害鍧楃殑鎸囬拡 + int rate_chroma = 0; + int bits_left; + int predmode_c = p_candidates[idx_chroma_mode].mode; + int cbp_c; + + /* 璺宠繃鑹插害鍒嗛噺绗簩娆¤皟鐢ㄨ繃绋嬩腑鐨勬ā寮忛夋嫨锛岀洿鎺ラ夊埌鏈浼樻ā寮忓畬鎴怰DOQ */ + if ((h->param->i_rdoq_level == RDOQ_CU_LEVEL && h->lcu.b_enable_rdoq) && predmode_c != best->i_intra_mode_c) { + continue; + } + if (predmode_c != DM_PRED_C && predmode_c == lmode) { + continue; + } + p_cu->cu_info.i_intra_mode_c = (int8_t)predmode_c; + + /* 瀹屾垚RDO杩囩▼鐨勮壊搴﹀潡鐨勯噸鏋勮繃绋嬶紙鍙樻崲銆侀噺鍖栥佸弽鍙樻崲鍙嶉噺鍖栧強姹傞噸鏋勫硷級 */ + cbp_c = cu_recon_chroma(h, p_aec, p_cu, &dist_chroma); + + p_cu->cu_info.i_cbp = (int8_t)(tmp_cbp_luma + cbp_c); + + /* ------- GET RATE -------- */ + rate_chroma = p_aec->binary.est_cu_header(h, p_aec, p_cu); +#if ENABLE_RATE_CONTROL_CU + rate_chroma += p_aec->binary.write_cu_cbp_dqp(h, p_aec, &p_cu->cu_info, h->i_slice_index, h->last_dquant); +#else + rate_chroma += p_aec->binary.write_cu_cbp(p_aec, &p_cu->cu_info, h->i_slice_index, h); +#endif + + bits_left = rdo_get_left_bits(h, *min_rdcost - rdcost_luma, dist_chroma); + + if (p_cu->cu_info.i_cbp & (1 << 4)) { + int cur_bits_left = bits_left - rate_chroma; + 
rate_chroma += p_aec->binary.est_chroma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[1], &p_enc->runlevel, level - 1, cur_bits_left); + } + if (p_cu->cu_info.i_cbp & (1 << 5)) { + int cur_bits_left = bits_left - rate_chroma; + rate_chroma += p_aec->binary.est_chroma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[2], &p_enc->runlevel, level - 1, cur_bits_left); + } + + rdcost = dist_chroma + h->f_lambda_mode * rate_chroma + rdcost_luma; + + min_mode_rdcost = XAVS2_MIN(rdcost, min_mode_rdcost); + + if (rdcost < *min_rdcost) { + *min_rdcost = rdcost; + h->copy_aec_state_rdo(&p_layer->cs_cu, p_aec); /* store coding state for the best mode */ + cu_store_parameters(h, p_cu, best); + b_need_swap_buf = 1; + } + + h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* revert to AEC context of best Luma mode */ + + if (IS_ALG_ENABLE(OPT_FAST_RDO_INTRA_C)) { + if (rdcost > *min_rdcost * 2 || + cbp_c == 0) { + break; + } + } + } + } else { /* YUV400 */ + /* ------- GET RATE -------- */ + int rate_hdr = p_aec->binary.est_cu_header(h, p_aec, p_cu); +#if ENABLE_RATE_CONTROL_CU + rate_hdr += p_aec->binary.write_cu_cbp_dqp(h, p_aec, &p_cu->cu_info, h->i_slice_index, h->last_dquant); +#else + rate_hdr += p_aec->binary.write_cu_cbp(p_aec, &p_cu->cu_info, h->i_slice_index, h); +#endif + rdcost = h->f_lambda_mode * rate_hdr + rdcost_luma; + + if (rdcost < *min_rdcost) { + *min_rdcost = rdcost; + h->copy_aec_state_rdo(&p_layer->cs_cu, p_aec); /* store coding state for the best mode */ + cu_store_parameters(h, p_cu, best); + b_need_swap_buf = 1; + } + } + + h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); /* revert to initial AEC context */ + + /* 4, confirm the buffer pointers and record the best information */ + if (best->p_rec10[0] == rec_bak_y && b_need_swap_buf) { + XAVS2_SWAP_PTR(best->p_rec10[0], p_cu->cu_info.p_rec10[0]); + XAVS2_SWAP_PTR(best->p_coeff[0], p_cu->cu_info.p_coeff[0]); + } + + p_layer->mode_rdcost[mode] = min_mode_rdcost; /* store the cost for SDIP fast algorithm */ 
+ } +} + +//#if OPT_BYPASS_SDIP +/* --------------------------------------------------------------------------- + * SDIP fast + */ +static ALWAYS_INLINE int sdip_early_bypass(xavs2_t *h, cu_layer_t *p_layer, int i_mode) +{ + UNUSED_PARAMETER(h); + return i_mode == PRED_I_nx2N && (p_layer->mode_rdcost[PRED_I_2Nxn] < p_layer->mode_rdcost[PRED_I_2Nx2N] * 0.9); +} +//#endif + +/** + * =========================================================================== + * local function defines (inter) + * =========================================================================== + */ + +//#if OPT_FAST_ZBLOCK || OPT_ECU +static const int tab_th_zero_block_sad[][5] = { + { 7, 19, 72, 281, 1115 }, { 7, 19, 73, 281, 1116 }, { 7, 20, 73, 282, 1118 }, + { 8, 20, 74, 283, 1120 }, { 8, 20, 74, 284, 1122 }, { 8, 20, 75, 285, 1124 }, + { 8, 21, 75, 286, 1126 }, { 8, 21, 76, 288, 1129 }, { 9, 21, 77, 289, 1132 }, + { 9, 22, 77, 291, 1135 }, { 9, 22, 78, 292, 1138 }, { 10, 23, 79, 294, 1142 }, + { 10, 23, 80, 296, 1146 }, { 10, 24, 81, 298, 1150 }, { 11, 24, 82, 301, 1155 }, + { 11, 25, 84, 303, 1160 }, { 12, 26, 85, 306, 1166 }, { 12, 26, 87, 309, 1172 }, + { 13, 27, 88, 312, 1179 }, { 13, 28, 90, 316, 1186 }, { 14, 29, 92, 320, 1194 }, + { 15, 30, 94, 325, 1203 }, { 15, 31, 97, 329, 1213 }, { 16, 33, 99, 334, 1223 }, + { 17, 34, 102, 340, 1235 }, { 18, 36, 105, 346, 1247 }, { 20, 37, 109, 353, 1260 }, + { 21, 39, 112, 360, 1275 }, { 22, 41, 116, 368, 1292 }, { 24, 43, 121, 377, 1309 }, + { 25, 46, 125, 386, 1328 }, { 27, 48, 131, 397, 1349 }, { 29, 51, 136, 408, 1372 }, + { 31, 54, 142, 420, 1397 }, { 33, 58, 149, 434, 1424 }, { 36, 61, 156, 448, 1453 }, + { 38, 65, 164, 464, 1485 }, { 41, 70, 173, 482, 1520 }, { 45, 74, 183, 501, 1559 }, + { 48, 79, 193, 521, 1600 }, { 52, 85, 204, 544, 1646 }, { 56, 91, 217, 569, 1696 }, + { 61, 98, 230, 596, 1750 }, { 66, 105, 245, 625, 1809 }, { 71, 113, 261, 657, 1873 }, + { 77, 122, 278, 692, 1944 }, { 83, 132, 297, 729, 2020 }, { 90, 142, 318, 
771, 2104 }, + { 98, 153, 341, 816, 2195 }, { 106, 166, 365, 865, 2294 }, { 116, 179, 392, 919, 2403 }, + { 126, 194, 422, 978, 2521 }, { 136, 210, 454, 1042, 2649 }, { 148, 227, 488, 1111, 2790 }, + { 161, 246, 526, 1187, 2943 }, { 175, 267, 568, 1270, 3110 }, { 191, 290, 613, 1360, 3292 }, + { 207, 314, 662, 1459, 3491 }, { 225, 341, 716, 1566, 3707 }, { 245, 370, 775, 1683, 3944 }, + { 267, 402, 839, 1811, 4201 }, { 291, 437, 909, 1950, 4482 }, { 316, 475, 985, 2102, 4788 }, + { 345, 517, 1068, 2268, 5123 }, { 375, 562, 1158, 2448, 5487 }, { 412, 617, 1268, 2667, 5928 }, + { 445, 665, 1364, 2860, 6317 }, { 485, 724, 1482, 3094, 6790 }, { 528, 788, 1610, 3350, 7305 }, + { 576, 858, 1749, 3628, 7867 }, { 631, 939, 1912, 3954, 8524 }, { 687, 1022, 2078, 4285, 9192 }, + { 748, 1113, 2259, 4647, 9920 }, { 812, 1206, 2446, 5019, 10671 }, { 884, 1313, 2661, 5448, 11537 }, + { 964, 1431, 2895, 5917, 12482 }, { 1047, 1553, 3140, 6406, 13469 }, { 1145, 1698, 3430, 6985, 14636 }, + { 1248, 1850, 3735, 7592, 15862 }, { 1357, 2011, 4055, 8233, 17154 } +}; + +/* --------------------------------------------------------------------------- + */ +static ALWAYS_INLINE +bool_t isZeroCuFast(xavs2_t *h, cu_t *p_cu) +{ + int i_level = p_cu->cu_info.i_level - MIN_PU_SIZE_IN_BIT; + int i_qp = cu_get_qp(h, &p_cu->cu_info); + int thres_satd = (int)(tab_th_zero_block_sad[i_qp][i_level] * h->param->factor_zero_block); + + return p_cu->sum_satd < thres_satd; +} +//#endif + +/* --------------------------------------------------------------------------- + * int scrFlag = 0; // 0=noSCR, 1=strongSCR, 2=jmSCR + */ +static INLINE int +tu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, + int i_level, int8_t *cbp, int blockidx, coeff_t *cur_blk, + int x_pu, int y_pu, int w_pu, int h_pu) +{ + cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); + int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS); + int part_idx = PART_INDEX(w_pu, 
h_pu); + int w_tr = w_pu >> used_wavelet; + int h_tr = h_pu >> used_wavelet; + int num_non_zero = 0; + coeff_t *coeff_y = p_cu->cu_info.p_coeff[0] + (blockidx << ((p_cu->cu_info.i_level - 1) << 1)); + + tu_get_dct_coeff(h, cur_blk, part_idx, w_tr, h_tr); + + num_non_zero = tu_quant_forward(h, p_aec, p_cu, cur_blk, i_level, w_tr, h_tr, + cu_get_qp(h, &p_cu->cu_info), 0, 1, DC_PRED); + + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_fdec = p_cu->cu_info.p_rec8[0] + y_pu * FREC_STRIDE + x_pu; + pel8_t *p_pred = p_layer->buf_pred_inter8 + y_pu * FREC_STRIDE + x_pu; + if (num_non_zero != 0) { + *cbp |= (1 << blockidx); // 鎸囧畾浣嶈缃负 1 + g_funcs.pixf.copy_ss8[PART_INDEX(w_tr, h_tr)](coeff_y, w_tr, cur_blk, w_tr); + + tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_level, cu_get_qp(h, &p_cu->cu_info), 1); + g_funcs.dctf.idct[part_idx](h, cur_blk, cur_blk, w_tr); + + g_funcs.pixf.add_ps8[part_idx](h, p_fdec, FREC_STRIDE, p_pred, cur_blk, FREC_STRIDE, w_pu); + } else { + /* 娓呴櫎CBP鎸囧畾浣嶇殑鍊硷紝杩欓噷CBP鍒濆鍊间负0锛屽洜鑰屾棤闇鎿嶄綔 */ + // 鍏ㄩ浂鍧椾笉蹇呭仛鍙嶅彉鎹㈠弽閲忓寲锛屽彧闇鎷疯礉棰勬祴鍊间负閲嶆瀯鍊 + coeff_y[0] = 0; + if (p_cu->cu_info.i_tu_split) { + g_funcs.pixf.copy_pp8[part_idx](p_fdec, FREC_STRIDE, p_pred, FREC_STRIDE); + } + } + } else { + pel10_t *p_fdec = p_cu->cu_info.p_rec10[0] + y_pu * FREC_STRIDE + x_pu; + pel10_t *p_pred = p_layer->buf_pred_inter10 + y_pu * FREC_STRIDE + x_pu; + if (num_non_zero != 0) { + *cbp |= (1 << blockidx); // 鎸囧畾浣嶈缃负 1 + g_funcs.pixf.copy_ss10[PART_INDEX(w_tr, h_tr)](coeff_y, w_tr, cur_blk, w_tr); + + tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_level, cu_get_qp(h, &p_cu->cu_info), 1); + g_funcs.dctf.idct[part_idx](h, cur_blk, cur_blk, w_tr); + + g_funcs.pixf.add_ps10[part_idx](h, p_fdec, FREC_STRIDE, p_pred, cur_blk, FREC_STRIDE, w_pu); } else { - /* 清除CBP指定位的值,这里CBP初始值为0,因而无需操作 */ - // 全零块不必做反变换反量化,只需拷贝预测值为重构值 + /* 娓呴櫎CBP鎸囧畾浣嶇殑鍊硷紝杩欓噷CBP鍒濆鍊间负0锛屽洜鑰屾棤闇鎿嶄綔 */ + // 鍏ㄩ浂鍧椾笉蹇呭仛鍙嶅彉鎹㈠弽閲忓寲锛屽彧闇鎷疯礉棰勬祴鍊间负閲嶆瀯鍊 coeff_y[0] = 0; if (p_cu->cu_info.i_tu_split) { - 
g_funcs.pixf.copy_pp[part_idx](p_fdec, FREC_STRIDE, p_pred, FREC_STRIDE); + g_funcs.pixf.copy_pp10[part_idx](p_fdec, FREC_STRIDE, p_pred, FREC_STRIDE); + } + } + } + + return num_non_zero; +} + + +/* --------------------------------------------------------------------------- + * 浠ユ寚瀹氭柟寮忛噸鏋勫抚闂撮娴嬫柟寮忕殑CU鐨勪寒搴﹀垎閲忥紱 + * 杩斿洖褰撳墠CU鍦板け鐪燂紙鍔犱笂鑹插害鍧楀け鐪燂級 + */ +static +dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, + int is_non_residual, int b_tu_split, + int cbp_c, dist_t dist_chroma) +{ + cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); + cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); + coeff_t *cur_blk = p_enc->coeff_blk; + coeff_t *coeff_bak = p_enc->coeff_bak; + coeff_t *p_resi; + int level = p_cu->cu_info.i_level; + int num_nonzero = 0; + int sum_dc_coeff = 0; + int b_zero_block = 0; + int blockidx; + int pix_x = p_cu->i_pos_x; + int pix_y = p_cu->i_pos_y; + int cu_size = p_cu->i_size; + int cu_size_2 = cu_size >> 1; + int cu_size_4 = cu_size_2 >> 1; + dist_t distortion; + + /* clear CBP */ + p_cu->cu_info.i_cbp = 0; + + /* encode for luma */ + cu_set_tu_split_type(h, &p_cu->cu_info, b_tu_split); + + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_fenc; + pel8_t *p_fdec; + if (is_non_residual) { /* SKIP mode (or no residual coding) */ + int uvoffset = (FREC_CSTRIDE >> 1); + int part_idx_c = PART_INDEX(cu_size_2, cu_size_2); + int pix_x_c = pix_x >> 1; + int pix_y_c = pix_y >> CHROMA_V_SHIFT; + + h->lcu.bypass_all_dmh |= (p_cu->cu_info.dmh_mode == 0); + /* copy Y component and get distortion */ + p_fenc = h->lcu.p_fenc8[0] + pix_y * FENC_STRIDE + pix_x; + p_fdec = p_cu->cu_info.p_rec8[0]; + g_funcs.pixf.copy_pp8[PART_INDEX(cu_size, cu_size)](p_fdec, FREC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE); + distortion = g_funcs.pixf.ssd8[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); + + /* chroma distortion */ + if (cbp_c) { + /* copy U component and get distortion */ + p_fenc = 
h->lcu.p_fenc8[1] + pix_y_c * FENC_STRIDE + pix_x_c; + p_fdec = p_cu->cu_info.p_rec8[1]; + g_funcs.pixf.copy_pp8[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter8_c, FREC_CSTRIDE); + distortion += g_funcs.pixf.ssd8[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); + + /* copy V component and get distortion */ + p_fenc = h->lcu.p_fenc8[2] + pix_y_c * FENC_STRIDE + pix_x_c; + p_fdec = p_cu->cu_info.p_rec8[2]; + g_funcs.pixf.copy_pp8[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter8_c + uvoffset, FREC_CSTRIDE); + distortion += g_funcs.pixf.ssd8[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); + } else { + distortion += dist_chroma; + } + + return distortion; + } else if (p_cu->cu_info.i_tu_split) { + int pix_cu_x = 0; + int pix_cu_y = 0; + + switch (p_cu->cu_info.i_tu_split) { + case TU_SPLIT_HOR: + g_funcs.pixf.copy_ss8[PART_INDEX(cu_size, cu_size)](cur_blk, cu_size, coeff_bak, cu_size); + for (blockidx = 0; blockidx < 4; blockidx++, pix_cu_y += cu_size_4) { + p_resi = cur_blk + pix_cu_y * cu_size + pix_cu_x; + num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, p_resi, pix_cu_x, pix_cu_y, cu_size, cu_size_4); + sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]); + } + break; + case TU_SPLIT_VER: + for (blockidx = 0; blockidx < 4; blockidx++, pix_cu_x += cu_size_4) { + p_resi = coeff_bak + pix_cu_y * cu_size + pix_cu_x; + g_funcs.pixf.copy_ss8[PART_INDEX(cu_size_4, cu_size)](cur_blk, cu_size_4, p_resi, cu_size); + num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, cur_blk, pix_cu_x, pix_cu_y, cu_size_4, cu_size); + sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]); + } + break; + default: + for (blockidx = 0; blockidx < 4; blockidx++) { + pix_cu_x = (blockidx & 1) * cu_size_2; + pix_cu_y = (blockidx >> 1) * cu_size_2; + p_resi = coeff_bak + pix_cu_y * cu_size + pix_cu_x; + 
g_funcs.pixf.copy_ss8[PART_INDEX(cu_size_2, cu_size_2)](cur_blk, cu_size_2, p_resi, cu_size); + num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, cur_blk, pix_cu_x, pix_cu_y, cu_size_2, cu_size_2); + sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]); + } + break; } - } - - return num_non_zero; -} + // 褰撳墠CU闈為浂绯绘暟涓嶅ぇ浜 LUMA_COEFF_COST 涓紝涓擠C绯绘暟骞朵笉澶х殑鎯呭喌涓嬶紝鍙瀹氫负鍏ㄩ浂鍧 + b_zero_block = (num_nonzero <= LUMA_COEFF_COST && sum_dc_coeff <= MAX_COEFF_QUASI_ZERO); + } else { + if (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block) { + b_zero_block = 1; + } else { + num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level, &p_cu->cu_info.i_cbp, 0, coeff_bak, 0, 0, cu_size, cu_size); -/* --------------------------------------------------------------------------- - * 以指定方式重构帧间预测方式的CU的亮度分量; - * 返回当前CU地失真(加上色度块失真) - */ -static -dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, - int is_non_residual, int b_tu_split, - int cbp_c, dist_t dist_chroma) -{ - cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); - cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); - coeff_t *cur_blk = p_enc->coeff_blk; - coeff_t *coeff_bak = p_enc->coeff_bak; - coeff_t *p_resi; - int level = p_cu->cu_info.i_level; - int num_nonzero = 0; - int sum_dc_coeff = 0; - int b_zero_block = 0; - int blockidx; - int pix_x = p_cu->i_pos_x; - int pix_y = p_cu->i_pos_y; - int cu_size = p_cu->i_size; - int cu_size_2 = cu_size >> 1; - int cu_size_4 = cu_size_2 >> 1; - dist_t distortion; - pel_t *p_fenc; - pel_t *p_fdec; + // 褰撳墠CU鐨勬墍鏈夊彉鎹㈠潡鐨勯潪闆剁郴鏁版暟閲忥紝涓嶅ぇ浜 LUMA_COEFF_COST 涓紝涓擠C绯绘暟骞朵笉澶х殑鎯呭喌涓嬶紝鍙瀹氫负鍏ㄩ浂鍧 + sum_dc_coeff = XAVS2_ABS(p_cu->cu_info.p_coeff[0][0]); + b_zero_block = (num_nonzero <= LUMA_COEFF_COST && sum_dc_coeff <= MAX_COEFF_QUASI_ZERO); + } + } - /* clear CBP */ - p_cu->cu_info.i_cbp = 0; + if (b_zero_block) { + h->lcu.bypass_all_dmh |= (h->i_type == SLICE_TYPE_F && p_cu->cu_info.dmh_mode 
== 0); + p_cu->cu_info.i_cbp = 0; + g_funcs.pixf.copy_pp8[PART_INDEX(cu_size, cu_size)](p_cu->cu_info.p_rec8[0], FREC_STRIDE, + p_layer->buf_pred_inter8, FREC_STRIDE); + } - /* encode for luma */ - cu_set_tu_split_type(h, &p_cu->cu_info, b_tu_split); + /* set CBP */ + p_cu->cu_info.i_cbp += (int8_t)cbp_c; + /* luma distortion */ + p_fenc = h->lcu.p_fenc8[0] + pix_y * FENC_STRIDE + pix_x; + p_fdec = p_cu->cu_info.p_rec8[0]; + distortion = dist_chroma; + distortion += g_funcs.pixf.ssd8[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); + return distortion; + } else { + pel10_t *p_fenc; + pel10_t *p_fdec; if (is_non_residual) { /* SKIP mode (or no residual coding) */ int uvoffset = (FREC_CSTRIDE >> 1); int part_idx_c = PART_INDEX(cu_size_2, cu_size_2); @@ -1412,24 +1949,24 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, h->lcu.bypass_all_dmh |= (p_cu->cu_info.dmh_mode == 0); /* copy Y component and get distortion */ - p_fenc = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE + pix_x; - p_fdec = p_cu->cu_info.p_rec[0]; - g_funcs.pixf.copy_pp[PART_INDEX(cu_size, cu_size)](p_fdec, FREC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); - distortion = g_funcs.pixf.ssd[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); + p_fenc = h->lcu.p_fenc10[0] + pix_y * FENC_STRIDE + pix_x; + p_fdec = p_cu->cu_info.p_rec10[0]; + g_funcs.pixf.copy_pp10[PART_INDEX(cu_size, cu_size)](p_fdec, FREC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE); + distortion = g_funcs.pixf.ssd10[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); /* chroma distortion */ if (cbp_c) { /* copy U component and get distortion */ - p_fenc = h->lcu.p_fenc[1] + pix_y_c * FENC_STRIDE + pix_x_c; - p_fdec = p_cu->cu_info.p_rec[1]; - g_funcs.pixf.copy_pp[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter_c, FREC_CSTRIDE); - distortion += g_funcs.pixf.ssd[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); + p_fenc = 
h->lcu.p_fenc10[1] + pix_y_c * FENC_STRIDE + pix_x_c; + p_fdec = p_cu->cu_info.p_rec10[1]; + g_funcs.pixf.copy_pp10[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter10_c, FREC_CSTRIDE); + distortion += g_funcs.pixf.ssd10[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); /* copy V component and get distortion */ - p_fenc = h->lcu.p_fenc[2] + pix_y_c * FENC_STRIDE + pix_x_c; - p_fdec = p_cu->cu_info.p_rec[2]; - g_funcs.pixf.copy_pp[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter_c + uvoffset, FREC_CSTRIDE); - distortion += g_funcs.pixf.ssd[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); + p_fenc = h->lcu.p_fenc10[2] + pix_y_c * FENC_STRIDE + pix_x_c; + p_fdec = p_cu->cu_info.p_rec10[2]; + g_funcs.pixf.copy_pp10[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter10_c + uvoffset, FREC_CSTRIDE); + distortion += g_funcs.pixf.ssd10[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); } else { distortion += dist_chroma; } @@ -1441,7 +1978,7 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, switch (p_cu->cu_info.i_tu_split) { case TU_SPLIT_HOR: - g_funcs.pixf.copy_ss[PART_INDEX(cu_size, cu_size)](cur_blk, cu_size, coeff_bak, cu_size); + g_funcs.pixf.copy_ss10[PART_INDEX(cu_size, cu_size)](cur_blk, cu_size, coeff_bak, cu_size); for (blockidx = 0; blockidx < 4; blockidx++, pix_cu_y += cu_size_4) { p_resi = cur_blk + pix_cu_y * cu_size + pix_cu_x; num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, p_resi, pix_cu_x, pix_cu_y, cu_size, cu_size_4); @@ -1451,7 +1988,7 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, case TU_SPLIT_VER: for (blockidx = 0; blockidx < 4; blockidx++, pix_cu_x += cu_size_4) { p_resi = coeff_bak + pix_cu_y * cu_size + pix_cu_x; - g_funcs.pixf.copy_ss[PART_INDEX(cu_size_4, cu_size)](cur_blk, cu_size_4, p_resi, cu_size); + g_funcs.pixf.copy_ss10[PART_INDEX(cu_size_4, cu_size)](cur_blk, cu_size_4, p_resi, cu_size); 
num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, cur_blk, pix_cu_x, pix_cu_y, cu_size_4, cu_size); sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]); } @@ -1461,14 +1998,14 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pix_cu_x = (blockidx & 1) * cu_size_2; pix_cu_y = (blockidx >> 1) * cu_size_2; p_resi = coeff_bak + pix_cu_y * cu_size + pix_cu_x; - g_funcs.pixf.copy_ss[PART_INDEX(cu_size_2, cu_size_2)](cur_blk, cu_size_2, p_resi, cu_size); + g_funcs.pixf.copy_ss10[PART_INDEX(cu_size_2, cu_size_2)](cur_blk, cu_size_2, p_resi, cu_size); num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, cur_blk, pix_cu_x, pix_cu_y, cu_size_2, cu_size_2); sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]); } break; } - // 当前CU非零系数不大于 LUMA_COEFF_COST 个,且DC系数并不大的情况下,可认定为全零块 + // 褰撳墠CU闈為浂绯绘暟涓嶅ぇ浜 LUMA_COEFF_COST 涓紝涓擠C绯绘暟骞朵笉澶х殑鎯呭喌涓嬶紝鍙瀹氫负鍏ㄩ浂鍧 b_zero_block = (num_nonzero <= LUMA_COEFF_COST && sum_dc_coeff <= MAX_COEFF_QUASI_ZERO); } else { if (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block) { @@ -1476,7 +2013,7 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, } else { num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level, &p_cu->cu_info.i_cbp, 0, coeff_bak, 0, 0, cu_size, cu_size); - // 当前CU的所有变换块的非零系数数量,不大于 LUMA_COEFF_COST 个,且DC系数并不大的情况下,可认定为全零块 + // 褰撳墠CU鐨勬墍鏈夊彉鎹㈠潡鐨勯潪闆剁郴鏁版暟閲忥紝涓嶅ぇ浜 LUMA_COEFF_COST 涓紝涓擠C绯绘暟骞朵笉澶х殑鎯呭喌涓嬶紝鍙瀹氫负鍏ㄩ浂鍧 sum_dc_coeff = XAVS2_ABS(p_cu->cu_info.p_coeff[0][0]); b_zero_block = (num_nonzero <= LUMA_COEFF_COST && sum_dc_coeff <= MAX_COEFF_QUASI_ZERO); } @@ -1485,19 +2022,20 @@ dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, if (b_zero_block) { h->lcu.bypass_all_dmh |= (h->i_type == SLICE_TYPE_F && p_cu->cu_info.dmh_mode == 0); p_cu->cu_info.i_cbp = 0; - g_funcs.pixf.copy_pp[PART_INDEX(cu_size, cu_size)](p_cu->cu_info.p_rec[0], FREC_STRIDE, - 
p_layer->buf_pred_inter, FREC_STRIDE); + g_funcs.pixf.copy_pp10[PART_INDEX(cu_size, cu_size)](p_cu->cu_info.p_rec10[0], FREC_STRIDE, + p_layer->buf_pred_inter10, FREC_STRIDE); } /* set CBP */ p_cu->cu_info.i_cbp += (int8_t)cbp_c; /* luma distortion */ - p_fenc = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE + pix_x; - p_fdec = p_cu->cu_info.p_rec[0]; + p_fenc = h->lcu.p_fenc10[0] + pix_y * FENC_STRIDE + pix_x; + p_fdec = p_cu->cu_info.p_rec10[0]; distortion = dist_chroma; - distortion += g_funcs.pixf.ssd[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); + distortion += g_funcs.pixf.ssd10[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); return distortion; + } } /* --------------------------------------------------------------------------- @@ -1584,7 +2122,7 @@ static int tu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, } /* --------------------------------------------------------------------------- - * 获取亮度、色度分量的预测像素值,返回MV是否在有效范围内 + * 鑾峰彇浜害銆佽壊搴﹀垎閲忕殑棰勬祴鍍忕礌鍊硷紝杩斿洖MV鏄惁鍦ㄦ湁鏁堣寖鍥村唴 */ static ALWAYS_INLINE int rdo_get_pred_inter(xavs2_t *h, cu_t *p_cu, int cal_luma_chroma) @@ -1603,16 +2141,14 @@ int rdo_get_pred_inter(xavs2_t *h, cu_t *p_cu, int cal_luma_chroma) int pix_x = p_cu->i_pix_x + start_x; int pix_y = p_cu->i_pix_y + start_y; - mv_t mv_1st, mv_2nd; // 第一(前向或者B帧单向预测)和第二(后向)运动矢量 - int ref_1st, ref_2nd; // 第一(前向或者B帧单向预测)和第二(后向)参考帧号 + mv_t mv_1st, mv_2nd; // 绗竴锛堝墠鍚戞垨鑰匓甯у崟鍚戦娴嬶級鍜岀浜岋紙鍚庡悜锛夎繍鍔ㄧ煝閲 + int ref_1st, ref_2nd; // 绗竴锛堝墠鍚戞垨鑰匓甯у崟鍚戦娴嬶級鍜岀浜岋紙鍚庡悜锛夊弬鑰冨抚鍙 int num_mvs; - int b_mv_valid; // MV是否有效:大小取值是否在标准规定的有效范围内 - pel_t *p_temp = p_enc->buf_pixel_temp; - pel_t *p_pred; + int b_mv_valid; // MV鏄惁鏈夋晥锛氬ぇ灏忓彇鍊兼槸鍚﹀湪鏍囧噯瑙勫畾鐨勬湁鏁堣寖鍥村唴 xavs2_frame_t *p_ref1 = NULL; xavs2_frame_t *p_ref2 = NULL; - /* MV的数量,大于1为双参考帧/DMH的预测 */ + /* MV鐨勬暟閲忥紝澶т簬1涓哄弻鍙傝冨抚/DMH鐨勯娴 */ num_mvs = cu_get_mvs_for_mc(h, p_cu, blockidx, &mv_1st, &mv_2nd, &ref_1st, &ref_2nd); b_mv_valid = check_mv_range(h, &mv_1st, ref_1st, pix_x, pix_y, width, height); if 
(num_mvs > 1) { @@ -1628,13 +2164,58 @@ int rdo_get_pred_inter(xavs2_t *h, cu_t *p_cu, int cal_luma_chroma) } /* y component */ + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_temp = p_enc->buf_pixel_temp8; + pel8_t *p_pred; + if (cal_luma_chroma & 1) { + p_pred = p_layer->buf_pred_inter8 + start_y * FREC_STRIDE + start_x; + + mc_luma8(h, p_pred, FREC_STRIDE, mv_1st.x, mv_1st.y, width, height, p_ref1); + if (num_mvs > 1) { + mc_luma8(h, p_temp, width, mv_2nd.x, mv_2nd.y, width, height, p_ref2); + g_funcs.pixf.avg8[PART_INDEX(width, height)](p_pred, FREC_STRIDE, p_pred, FREC_STRIDE, p_temp, width, 32); + } + } + + /* u and v component */ + if (h->param->chroma_format == CHROMA_420 && (cal_luma_chroma & 2)) { + int uvoffset = (FREC_CSTRIDE >> 1); + start_x >>= 1; + width >>= 1; + pix_x >>= 1; + start_y >>= CHROMA_V_SHIFT; + pix_y >>= CHROMA_V_SHIFT; + height >>= CHROMA_V_SHIFT; + + p_pred = p_enc->buf_pred_inter8_c + start_y * FREC_CSTRIDE + start_x; + + /* u component */ + mc_chroma8(h, p_pred, p_pred + uvoffset, FREC_CSTRIDE, + mv_1st.x, mv_1st.y, width, height, p_ref1); + + if (num_mvs > 1) { + mc_chroma8(h, p_temp, p_temp + uvoffset, FREC_CSTRIDE, + mv_2nd.x, mv_2nd.y, width, height, p_ref2); + + if (width != 2 && width != 6 && height != 2 && height != 6) { + pixel_avg_pp8_t func_avg = g_funcs.pixf.avg8[PART_INDEX(width, height)]; + func_avg(p_pred , FREC_CSTRIDE, p_pred , FREC_CSTRIDE, p_temp , FREC_CSTRIDE, 32); + func_avg(p_pred + uvoffset, FREC_CSTRIDE, p_pred + uvoffset, FREC_CSTRIDE, p_temp + uvoffset, FREC_CSTRIDE, 32); + } else { + g_funcs.pixf.average8(p_pred, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE / 2, p_temp, FREC_CSTRIDE / 2, width, height * 2); + } + } + } + } else { + pel10_t *p_temp = p_enc->buf_pixel_temp10; + pel10_t *p_pred; if (cal_luma_chroma & 1) { - p_pred = p_layer->buf_pred_inter + start_y * FREC_STRIDE + start_x; + p_pred = p_layer->buf_pred_inter10 + start_y * FREC_STRIDE + start_x; - mc_luma(p_pred, FREC_STRIDE, mv_1st.x, 
mv_1st.y, width, height, p_ref1); + mc_luma10(h, p_pred, FREC_STRIDE, mv_1st.x, mv_1st.y, width, height, p_ref1); if (num_mvs > 1) { - mc_luma(p_temp, width, mv_2nd.x, mv_2nd.y, width, height, p_ref2); - g_funcs.pixf.avg[PART_INDEX(width, height)](p_pred, FREC_STRIDE, p_pred, FREC_STRIDE, p_temp, width, 32); + mc_luma10(h, p_temp, width, mv_2nd.x, mv_2nd.y, width, height, p_ref2); + g_funcs.pixf.avg10[PART_INDEX(width, height)](p_pred, FREC_STRIDE, p_pred, FREC_STRIDE, p_temp, width, 32); } } @@ -1648,25 +2229,26 @@ int rdo_get_pred_inter(xavs2_t *h, cu_t *p_cu, int cal_luma_chroma) pix_y >>= CHROMA_V_SHIFT; height >>= CHROMA_V_SHIFT; - p_pred = p_enc->buf_pred_inter_c + start_y * FREC_CSTRIDE + start_x; + p_pred = p_enc->buf_pred_inter10_c + start_y * FREC_CSTRIDE + start_x; /* u component */ - mc_chroma(p_pred, p_pred + uvoffset, FREC_CSTRIDE, + mc_chroma10(h, p_pred, p_pred + uvoffset, FREC_CSTRIDE, mv_1st.x, mv_1st.y, width, height, p_ref1); if (num_mvs > 1) { - mc_chroma(p_temp, p_temp + uvoffset, FREC_CSTRIDE, + mc_chroma10(h, p_temp, p_temp + uvoffset, FREC_CSTRIDE, mv_2nd.x, mv_2nd.y, width, height, p_ref2); if (width != 2 && width != 6 && height != 2 && height != 6) { - pixel_avg_pp_t func_avg = g_funcs.pixf.avg[PART_INDEX(width, height)]; + pixel_avg_pp10_t func_avg = g_funcs.pixf.avg10[PART_INDEX(width, height)]; func_avg(p_pred , FREC_CSTRIDE, p_pred , FREC_CSTRIDE, p_temp , FREC_CSTRIDE, 32); func_avg(p_pred + uvoffset, FREC_CSTRIDE, p_pred + uvoffset, FREC_CSTRIDE, p_temp + uvoffset, FREC_CSTRIDE, 32); } else { - g_funcs.pixf.average(p_pred, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE / 2, p_temp, FREC_CSTRIDE / 2, width, height * 2); + g_funcs.pixf.average10(p_pred, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE / 2, p_temp, FREC_CSTRIDE / 2, width, height * 2); } } } + } } return 1; @@ -1699,9 +2281,8 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, dist_t dist_split = 0; dist_t dist_notsplit = 0; dist_t best_dist_cur = 0; - 
rdcost_t rdcost = *min_rdcost; // 初始化为最大可允许的RDCost + rdcost_t rdcost = *min_rdcost; // 鍒濆鍖栦负鏈澶у彲鍏佽鐨凴DCost rdcost_t rdcost_split = rdcost; - pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); @@ -1746,15 +2327,187 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, } } + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_fenc = h->lcu.p_fenc8[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; + /* 3.2, check luma CU tu-split type and CBP */ + /* 3.2.1, get luma residual */ + g_funcs.pixf.sub_ps8[PART_INDEX(cu_size, cu_size)](p_enc->coeff_bak, cu_size, + p_fenc, p_layer->buf_pred_inter8, + FENC_STRIDE, FREC_STRIDE); + + /* 3.2.2, Fast algorithm, check whether TU split is essential */ + if (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) || IS_ALG_ENABLE(OPT_ECU)) { + p_cu->sum_satd = g_funcs.pixf.sad8[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter8, FREC_STRIDE, p_fenc, FENC_STRIDE); + p_cu->is_zero_block = isZeroCuFast(h, p_cu); + } + + /* only get cost with tu depth equals 1 */ + if ((h->enable_tu_2level == 1) || ((h->enable_tu_2level == 3) && (p_best->i_tu_split != 0))) { + if (b_try_tu_split && b_try_tu_nonsplit && (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block)) { + b_try_tu_split = FALSE; + } + + if (b_try_tu_split) { + h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); /* store coding state for tu depth = 1 */ + + dist_split = cu_recon_inter_luma(h, &p_enc->cs_tu, p_cu, 0, 1, cbp_c, dist_chroma); + tmp_cbp = tu_rdcost_inter(h, &p_enc->cs_tu, p_cu, dist_split, rate_chroma, &rdcost_split); + + /* store dct coefficients, rec data and coding state for tu depth = 1*/ + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); + } else { + rdcost_split = MAX_COST; + tmp_cbp = 0; + } + if (rdcost_split >= 
*min_rdcost) { + h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); + return 0; /* return code = 0, means it is not the best mode */ + } else { + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); + p_layer->mode_rdcost[mode] = XAVS2_MIN(rdcost_split, p_layer->mode_rdcost[mode]); + /* restore the cbp, dct coefficients, rec data and coding state for tu depth = 1*/ + p_cu->cu_info.i_cbp = (int8_t)tmp_cbp; + *min_rdcost = rdcost_split; + p_cu->best_dist_total = dist_split; + h->copy_aec_state_rdo(&p_layer->cs_cu, &p_enc->cs_tu); + h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); + cu_store_parameters(h, p_cu, p_best); + return 1; /* return code = 1, means it is the best mode */ + } + } else if ((h->enable_tu_2level == 0) || ((h->enable_tu_2level == 3) && (p_best->i_tu_split == 0))) { /* only get cost with tu depth equals 0 */ + dist_notsplit = cu_recon_inter_luma(h, p_aec, p_cu, 0, 0, cbp_c, dist_chroma); + tu_rdcost_inter(h, p_aec, p_cu, dist_notsplit, rate_chroma, &rdcost); + } else { + if (b_try_tu_split && b_try_tu_nonsplit && (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block)) { + b_try_tu_split = FALSE; + } + + if (b_try_tu_split) { + h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); /* store coding state for tu depth = 1 */ + + dist_split = cu_recon_inter_luma(h, &p_enc->cs_tu, p_cu, 0, 1, cbp_c, dist_chroma); + tmp_cbp = tu_rdcost_inter(h, &p_enc->cs_tu, p_cu, dist_split, rate_chroma, &rdcost_split); + + /* store dct coefficients, rec data and coding state for tu depth = 1*/ + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); + } else { + rdcost_split = MAX_COST; + tmp_cbp = 0; + } + + /* 3.2.4, get cost with tu depth equals 0 */ + if (b_try_tu_nonsplit) { + dist_notsplit = cu_recon_inter_luma(h, p_aec, p_cu, 0, 0, cbp_c, dist_chroma); + tu_rdcost_inter(h, p_aec, p_cu, dist_notsplit, rate_chroma, 
&rdcost); + } + + /* 3.2.5, choose the best tu depth (whether split or not) */ + if (rdcost > rdcost_split) { + /* the best tu depth is 1 */ + rdcost = rdcost_split; + best_dist_cur = dist_split; + cu_set_tu_split_type(h, &p_cu->cu_info, 1); + + /* restore the cbp, dct coefficients, rec data and coding state for tu depth = 1*/ + p_cu->cu_info.i_cbp = (int8_t)tmp_cbp; + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); + + h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* restore coding state */ + } else { + best_dist_cur = dist_notsplit; + } + } + + if (IS_ALG_ENABLE(OPT_CBP_DIRECT) && IS_SKIP_MODE(mode)) { + /* Skip/Direct妯″紡鐨勬畫宸粡杩囧彉鎹㈤噺鍖栧悗涓哄叏闆跺潡锛 + * 姝ゆ椂缁堟涓嬪眰CU鍒掑垎鍙互寰楀埌杈冨鏃堕棿鑺傜渷涓旀崯澶辫緝灏忥紝 + * 浣嗚烦杩囨櫘閫歅U鍒掑垎妯″紡骞朵笉鑳藉甫鏉ユ洿澶氱殑鍔犻熴 + */ + p_cu->b_cbp_direct = (p_cu->cu_info.i_cbp == 0); + } + + /* 3.3, check skip mode for PRED_SKIP when CBP is nonzero */ + if (IS_SKIP_MODE(p_cu->cu_info.i_mode) && p_cu->cu_info.i_cbp != 0) { + rdcost_t rdcost_skip = MAX_COST; + dist_t dist_total_skip; + int best_tu_split_type = p_cu->cu_info.i_tu_split; + + if (best_tu_split_type == TU_SPLIT_NON) { + h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); /* store coding state for best Direct mode */ + } + + h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);/* restore coding state */ + + tmp_cbp = p_cu->cu_info.i_cbp; + /* backup reconstruction buffers, prepare for SKIP mode */ + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]); + if (cbp_c != 0) { + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[1], p_layer->p_rec8_tmp[1]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[2], p_layer->p_rec8_tmp[2]); + } + + /* check SKIP Mode */ + dist_total_skip = cu_recon_inter_luma(h, p_aec, p_cu, 1, 0, cbp_c, dist_chroma); + tu_rdcost_inter(h, p_aec, p_cu, dist_total_skip, rate_chroma, &rdcost_skip); + + if (rdcost_skip <= rdcost) { + rdcost = rdcost_skip; /* skip mode is the best */ + best_dist_cur = dist_total_skip; + 
p_cu->cu_info.i_tu_split = TU_SPLIT_NON; + } else { + h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* restore coding state */ + /* revert buffers */ + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[0], p_layer->p_rec8_tmp[0]); + if (cbp_c != 0) { + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[1], p_layer->p_rec8_tmp[1]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec8[2], p_layer->p_rec8_tmp[2]); + } + + p_cu->cu_info.i_cbp = (int8_t)(tmp_cbp); + p_cu->cu_info.i_tu_split = (int8_t)(best_tu_split_type); + } + } + + /* ------------------------------------------------------------- + * 4, store the min cost for current cu mode + */ + p_layer->mode_rdcost[mode] = XAVS2_MIN(rdcost, p_layer->mode_rdcost[mode]); + + /* ------------------------------------------------------------- + * 5, update the min cost, restore the coding state and return + */ + if (rdcost >= *min_rdcost) { + h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); + return 0; /* return code = 0, means it is not the best mode */ + } else { + if (mode == PRED_SKIP && IS_ALG_ENABLE(OPT_ROUGH_SKIP_SEL)) { + /* re-cover best skip prediction data */ + XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best); + } + *min_rdcost = rdcost; + p_cu->best_dist_total = best_dist_cur; + /* store coding state for the best mode */ + h->copy_aec_state_rdo(&p_layer->cs_cu, p_aec); + h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); + /* update best CU information */ + cu_store_parameters(h, p_cu, p_best); + return 1; /* return code = 1, means it is the best mode */ + } + } else { + pel10_t *p_fenc = h->lcu.p_fenc10[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; /* 3.2, check luma CU tu-split type and CBP */ /* 3.2.1, get luma residual */ - g_funcs.pixf.sub_ps[PART_INDEX(cu_size, cu_size)](p_enc->coeff_bak, cu_size, - p_fenc, p_layer->buf_pred_inter, + g_funcs.pixf.sub_ps10[PART_INDEX(cu_size, cu_size)](p_enc->coeff_bak, cu_size, + p_fenc, p_layer->buf_pred_inter10, FENC_STRIDE, FREC_STRIDE); /* 3.2.2, Fast algorithm, check whether TU split is 
essential */ if (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) || IS_ALG_ENABLE(OPT_ECU)) { - p_cu->sum_satd = g_funcs.pixf.sad[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter, FREC_STRIDE, p_fenc, FENC_STRIDE); + p_cu->sum_satd = g_funcs.pixf.sad10[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter10, FREC_STRIDE, p_fenc, FENC_STRIDE); p_cu->is_zero_block = isZeroCuFast(h, p_cu); } @@ -1771,7 +2524,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, tmp_cbp = tu_rdcost_inter(h, &p_enc->cs_tu, p_cu, dist_split, rate_chroma, &rdcost_split); /* store dct coefficients, rec data and coding state for tu depth = 1*/ - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); } else { rdcost_split = MAX_COST; @@ -1781,7 +2534,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); return 0; /* return code = 0, means it is not the best mode */ } else { - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); p_layer->mode_rdcost[mode] = XAVS2_MIN(rdcost_split, p_layer->mode_rdcost[mode]); /* restore the cbp, dct coefficients, rec data and coding state for tu depth = 1*/ @@ -1808,7 +2561,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, tmp_cbp = tu_rdcost_inter(h, &p_enc->cs_tu, p_cu, dist_split, rate_chroma, &rdcost_split); /* store dct coefficients, rec data and coding state for tu depth = 1*/ - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); } else { rdcost_split = MAX_COST; @@ -1830,7 +2583,7 @@ int 
cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, /* restore the cbp, dct coefficients, rec data and coding state for tu depth = 1*/ p_cu->cu_info.i_cbp = (int8_t)tmp_cbp; - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* restore coding state */ @@ -1840,9 +2593,9 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, } if (IS_ALG_ENABLE(OPT_CBP_DIRECT) && IS_SKIP_MODE(mode)) { - /* Skip/Direct模式的残差经过变换量化后为全零块: - * 此时终止下层CU划分可以得到较多时间节省且损失较小, - * 但跳过普通PU划分模式并不能带来更多的加速。 + /* Skip/Direct妯″紡鐨勬畫宸粡杩囧彉鎹㈤噺鍖栧悗涓哄叏闆跺潡锛 + * 姝ゆ椂缁堟涓嬪眰CU鍒掑垎鍙互寰楀埌杈冨鏃堕棿鑺傜渷涓旀崯澶辫緝灏忥紝 + * 浣嗚烦杩囨櫘閫歅U鍒掑垎妯″紡骞朵笉鑳藉甫鏉ユ洿澶氱殑鍔犻熴 */ p_cu->b_cbp_direct = (p_cu->cu_info.i_cbp == 0); } @@ -1861,10 +2614,10 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, tmp_cbp = p_cu->cu_info.i_cbp; /* backup reconstruction buffers, prepare for SKIP mode */ - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]); if (cbp_c != 0) { - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[1], p_layer->p_rec_tmp[1]); - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[2], p_layer->p_rec_tmp[2]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[1], p_layer->p_rec10_tmp[1]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[2], p_layer->p_rec10_tmp[2]); } /* check SKIP Mode */ @@ -1878,10 +2631,10 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, } else { h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* restore coding state */ /* revert buffers */ - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[0], p_layer->p_rec10_tmp[0]); if (cbp_c != 0) { - XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[1], p_layer->p_rec_tmp[1]); - 
XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[2], p_layer->p_rec_tmp[2]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[1], p_layer->p_rec10_tmp[1]); + XAVS2_SWAP_PTR(p_cu->cu_info.p_rec10[2], p_layer->p_rec10_tmp[2]); } p_cu->cu_info.i_cbp = (int8_t)(tmp_cbp); @@ -1903,7 +2656,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, } else { if (mode == PRED_SKIP && IS_ALG_ENABLE(OPT_ROUGH_SKIP_SEL)) { /* re-cover best skip prediction data */ - XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); + XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best); } *min_rdcost = rdcost; p_cu->best_dist_total = best_dist_cur; @@ -1914,6 +2667,7 @@ int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, cu_store_parameters(h, p_cu, p_best); return 1; /* return code = 1, means it is the best mode */ } + } } /* --------------------------------------------------------------------------- @@ -1970,7 +2724,7 @@ rdcost_t cu_rdo_motion_estimation(xavs2_t *h, cu_t *p_cu, xavs2_me_t *p_me, int p_cb = &p_cu->cu_info.cb[block]; cu_get_neighbors(h, p_cu, p_cb); - /* 第一个PU不需要重新进行ME(MVP不变) */ + /* 绗竴涓狿U涓嶉渶瑕侀噸鏂拌繘琛孧E锛圡VP涓嶅彉锛 */ if (dualpred_enabled < 0 && block == 0) { best_fwd_ref = p_mode->ref_idx_single[0]; } else { @@ -2064,14 +2818,14 @@ rdcost_t cu_rdo_motion_estimation(xavs2_t *h, cu_t *p_cu, xavs2_me_t *p_me, int p_cu->cu_info.b8pdir[block] = (int8_t)best_pdir; } - cu_get_mvds(h, p_cu); // 生成MVD + cu_get_mvds(h, p_cu); // 鐢熸垚MVD - return total_cost; // 返回最小Cost + return total_cost; // 杩斿洖鏈灏廋ost } //#if OPT_DMH_CANDIDATE /* --------------------------------------------------------------------------- - * 提前获取最优的DMH模式候选,减少RDO次数 + * 鎻愬墠鑾峰彇鏈浼樼殑DMH妯″紡鍊欓夛紝鍑忓皯RDO娆℃暟 */ static int dmh_bits[9] = { // 0, 3, 3, 4, 4, 5, 5, 5, 5 @@ -2082,22 +2836,46 @@ static int rdo_get_dmh_candidate(xavs2_t *h, cu_t *p_cu, rdcost_t rdcost_non_dmh { const int num_dmh_modes = DMH_MODE_NUM + DMH_MODE_NUM - 1; int cu_size = 1 << 
p_cu->cu_info.i_level; - pixel_ssd_t cmp_dmh = g_funcs.pixf.ssd[PART_INDEX(cu_size, cu_size)]; rdcost_t min_distotion = MAX_COST; dist_t distortion; rdcost_t cost; int best_dmh_cand = -1; cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); - pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; int i; int rate; - /* 遍历DMH模式执行预测并计算失真,取失真最小的一个模式作为DMH候选集 */ + /* 閬嶅巻DMH妯″紡鎵ц棰勬祴骞惰绠楀け鐪燂紝鍙栧け鐪熸渶灏忕殑涓涓ā寮忎綔涓篋MH鍊欓夐泦 */ + if (h->param->input_sample_bit_depth == 8) { + pixel8_ssd_t cmp_dmh = g_funcs.pixf.ssd8[PART_INDEX(cu_size, cu_size)]; + pel8_t *p_fenc = h->lcu.p_fenc8[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; + for (i = 1; i < num_dmh_modes; i++) { + /* get prediction data and luma distortion */ + p_cu->cu_info.dmh_mode = (int8_t)(i); + if (rdo_get_pred_inter(h, p_cu, 1)) { + rate = dmh_bits[i]; + distortion = cmp_dmh(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE); + cost = distortion + h->f_lambda_mode * rate; + if (cost < min_distotion) { + min_distotion = cost; + best_dmh_cand = i; + } + } + } + + if (IS_ALG_ENABLE(OPT_SKIP_DMH_THRES) && min_distotion > (rdcost_t)(1.2 * rdcost_non_dmh)) { + /* 涓嶈冭檻娈嬪樊缂栫爜甯︽潵鐨刣istortion鍑忓皯 */ + return -1; + } else { + return best_dmh_cand; + } + } else { + pixel10_ssd_t cmp_dmh = g_funcs.pixf.ssd10[PART_INDEX(cu_size, cu_size)]; + pel10_t *p_fenc = h->lcu.p_fenc10[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; for (i = 1; i < num_dmh_modes; i++) { /* get prediction data and luma distortion */ p_cu->cu_info.dmh_mode = (int8_t)(i); if (rdo_get_pred_inter(h, p_cu, 1)) { rate = dmh_bits[i]; - distortion = cmp_dmh(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); + distortion = cmp_dmh(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE); cost = distortion + h->f_lambda_mode * rate; if (cost < min_distotion) { min_distotion = cost; @@ -2107,17 +2885,18 @@ static int rdo_get_dmh_candidate(xavs2_t *h, cu_t *p_cu, rdcost_t rdcost_non_dmh } if 
(IS_ALG_ENABLE(OPT_SKIP_DMH_THRES) && min_distotion > (rdcost_t)(1.2 * rdcost_non_dmh)) { - /* 不考虑残差编码带来的distortion减少 */ + /* 涓嶈冭檻娈嬪樊缂栫爜甯︽潵鐨刣istortion鍑忓皯 */ return -1; } else { return best_dmh_cand; } + } } //#endif /* --------------------------------------------------------------------------- - * 尝试所有帧间预测块划分方式,选择一个最优的划分 + * 灏濊瘯鎵鏈夊抚闂撮娴嬪潡鍒掑垎鏂瑰紡锛岄夋嫨涓涓渶浼樼殑鍒掑垎 */ static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32_t inter_modes, cu_info_t *best, rdcost_t *p_min_rdcost, @@ -2133,7 +2912,6 @@ static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32 int pix_y = p_cu->i_pix_y; int pix_x_c = pix_x >> 1; int pix_y_c = pix_y >> CHROMA_V_SHIFT; - pel_t *p_fenc[3]; int i; int64_t min_cost = MAX_COST; int64_t mecost; @@ -2145,14 +2923,71 @@ static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32 //inter_modes |= (uint32_t)((1 << PRED_2NxN) | (1 << PRED_Nx2N)); + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_fenc[3]; + for (mode = 1; mode < MAX_INTER_MODES; mode++) { + /* 鎵ц杩愬姩浼拌 */ + + if (!(inter_modes & (1 << mode))) { + continue; // 鐩存帴璺宠繃涓嶅彲鐢ㄦā寮忕殑鍐崇瓥 + } + + /* 蹇熷喅绛(OPT_BYPASS_AMP)锛氬鏋淧2NxN鏈幏寰楁渶浼橈紝鐩存帴璺宠繃鐩稿悓鍒掑垎鏂瑰悜鐨凱RED_2NxnU/PRED_2NxnD; PNx2N鍚岀悊 */ + if (IS_ALG_ENABLE(OPT_BYPASS_AMP) && i_level > B16X16_IN_BIT) { + if ((mode == PRED_2NxnU || mode == PRED_2NxnD) && best_cu_mode != PRED_2NxN) { + continue; + } else if ((mode == PRED_nLx2N || mode == PRED_nRx2N) && best_cu_mode != PRED_Nx2N) { + continue; + } + } + + p_cu->cu_info.i_mode = (int8_t)mode; + cu_init_pu_inter(h, &p_cu->cu_info, i_level, mode); + cu_rdo_motion_estimation(h, p_cu, &h->me_state, b_dhp_enabled); + + /* 浼拌Cost閫夊彇鏈灏忕殑 */ + p_cu->cu_info.directskip_wsm_idx = 0; + p_cu->cu_info.directskip_mhp_idx = DS_NONE; + p_cu->cu_info.dmh_mode = 0; + + rdo_get_pred_inter(h, p_cu, 3); + p_fenc[0] = h->lcu.p_fenc8[0] + pix_y * FENC_STRIDE + pix_x; + p_fenc[1] = h->lcu.p_fenc8[1] + pix_y_c * FENC_STRIDE + pix_x_c; + p_fenc[2] = 
h->lcu.p_fenc8[2] + pix_y_c * FENC_STRIDE + pix_x_c; + + mecost = g_funcs.pixf.sa8d8[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter8, FREC_STRIDE, p_fenc[0], FENC_STRIDE); + mecost += g_funcs.pixf.sa8d8[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter8_c, FREC_CSTRIDE, p_fenc[1], FENC_STRIDE); + mecost += g_funcs.pixf.sa8d8[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter8_c + (FREC_CSTRIDE >> 1), FREC_CSTRIDE, p_fenc[2], FENC_STRIDE); + + for (i = 0; i < p_cu->cu_info.num_pu; i++) { + mecost += p_cu->mvcost[i]; + ref1 = p_cu->cu_info.ref_idx_1st[i]; + ref2= p_cu->cu_info.ref_idx_2nd[i]; + if (h->i_type != SLICE_TYPE_B) { + mecost += (ref1 == INVALID_REF? 0: REF_COST(ref1)); + mecost += (ref2 == INVALID_REF? 0: REF_COST(ref2)); + } + } + + if (mecost < min_cost) { + memcpy(&p_layer->cu_mode.best_mc_tmp, &p_cu->mc, sizeof(p_cu->mc)); + memcpy(best, &p_cu->cu_info, sizeof(cu_info_t)); + min_cost = mecost; + best_cu_mode = mode; + } + } + + return best_cu_mode; + } else { + pel10_t *p_fenc[3]; for (mode = 1; mode < MAX_INTER_MODES; mode++) { - /* 执行运动估计 */ + /* 鎵ц杩愬姩浼拌 */ if (!(inter_modes & (1 << mode))) { - continue; // 直接跳过不可用模式的决策 + continue; // 鐩存帴璺宠繃涓嶅彲鐢ㄦā寮忕殑鍐崇瓥 } - /* 快速决策(OPT_BYPASS_AMP):如果P2NxN未获得最优,直接跳过相同划分方向的PRED_2NxnU/PRED_2NxnD; PNx2N同理 */ + /* 蹇熷喅绛(OPT_BYPASS_AMP)锛氬鏋淧2NxN鏈幏寰楁渶浼橈紝鐩存帴璺宠繃鐩稿悓鍒掑垎鏂瑰悜鐨凱RED_2NxnU/PRED_2NxnD; PNx2N鍚岀悊 */ if (IS_ALG_ENABLE(OPT_BYPASS_AMP) && i_level > B16X16_IN_BIT) { if ((mode == PRED_2NxnU || mode == PRED_2NxnD) && best_cu_mode != PRED_2NxN) { continue; @@ -2165,19 +3000,19 @@ static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32 cu_init_pu_inter(h, &p_cu->cu_info, i_level, mode); cu_rdo_motion_estimation(h, p_cu, &h->me_state, b_dhp_enabled); - /* 估计Cost选取最小的 */ + /* 浼拌Cost閫夊彇鏈灏忕殑 */ p_cu->cu_info.directskip_wsm_idx = 0; p_cu->cu_info.directskip_mhp_idx = DS_NONE; p_cu->cu_info.dmh_mode = 0; rdo_get_pred_inter(h, p_cu, 3); - p_fenc[0] = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE 
+ pix_x; - p_fenc[1] = h->lcu.p_fenc[1] + pix_y_c * FENC_STRIDE + pix_x_c; - p_fenc[2] = h->lcu.p_fenc[2] + pix_y_c * FENC_STRIDE + pix_x_c; + p_fenc[0] = h->lcu.p_fenc10[0] + pix_y * FENC_STRIDE + pix_x; + p_fenc[1] = h->lcu.p_fenc10[1] + pix_y_c * FENC_STRIDE + pix_x_c; + p_fenc[2] = h->lcu.p_fenc10[2] + pix_y_c * FENC_STRIDE + pix_x_c; - mecost = g_funcs.pixf.sa8d[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter, FREC_STRIDE, p_fenc[0], FENC_STRIDE); - mecost += g_funcs.pixf.sa8d[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter_c, FREC_CSTRIDE, p_fenc[1], FENC_STRIDE); - mecost += g_funcs.pixf.sa8d[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter_c + (FREC_CSTRIDE >> 1), FREC_CSTRIDE, p_fenc[2], FENC_STRIDE); + mecost = g_funcs.pixf.sa8d10[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter10, FREC_STRIDE, p_fenc[0], FENC_STRIDE); + mecost += g_funcs.pixf.sa8d10[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter10_c, FREC_CSTRIDE, p_fenc[1], FENC_STRIDE); + mecost += g_funcs.pixf.sa8d10[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter10_c + (FREC_CSTRIDE >> 1), FREC_CSTRIDE, p_fenc[2], FENC_STRIDE); for (i = 0; i < p_cu->cu_info.num_pu; i++) { mecost += p_cu->mvcost[i]; @@ -2198,10 +3033,11 @@ static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32 } return best_cu_mode; + } } /* --------------------------------------------------------------------------- - * 尝试普通帧间预测块划分方式,并计算相应的Cost + * 灏濊瘯鏅氬抚闂撮娴嬪潡鍒掑垎鏂瑰紡锛屽苟璁$畻鐩稿簲鐨凜ost */ static void cu_check_inter_partition(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int mode, int i_level, @@ -2216,13 +3052,13 @@ void cu_check_inter_partition(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int mode, in h->lcu.bypass_all_dmh = 0; - /* 计算一个帧间划分模式的RDCost,以确定最优编码模式 */ + /* 璁$畻涓涓抚闂村垝鍒嗘ā寮忕殑RDCost锛屼互纭畾鏈浼樼紪鐮佹ā寮 */ p_cu->cu_info.directskip_wsm_idx = 0; p_cu->cu_info.directskip_mhp_idx = DS_NONE; p_cu->cu_info.dmh_mode = 0; cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, best); - /* 检查DMH模式 */ + 
/* 妫鏌MH妯″紡 */ if (h->i_type == SLICE_TYPE_F && h->param->enable_dmh && !h->lcu.bypass_all_dmh && b_check_dmh && !(i_level == B8X8_IN_BIT && mode != PRED_2Nx2N)) { // disable 8x4 or 4x8 2MVs/PU mode int dmh_mode_candidate = 0; @@ -2232,26 +3068,26 @@ void cu_check_inter_partition(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int mode, in if (p_cu->cu_info.b8pdir[0] == PDIR_FWD && p_cu->cu_info.b8pdir[1] == PDIR_FWD && p_cu->cu_info.b8pdir[2] == PDIR_FWD && p_cu->cu_info.b8pdir[3] == PDIR_FWD) { - /* ME确定的最优的PU预测方向均为单前向,此时只需要检查后续DMH模式 */ + /* ME纭畾鐨勬渶浼樼殑PU棰勬祴鏂瑰悜鍧囦负鍗曞墠鍚戯紝姝ゆ椂鍙渶瑕佹鏌ュ悗缁璂MH妯″紡 */ dmh_mode = 1; - } else { // DHP 开启且参考帧数量为2时才有可能上述条件不成立 - /* 最优的PU中包含双前向块,此时需要计算PU均为单前向时的RDCosts,再遍历后续DMH模式 */ - /* 此时需重新ME,同时第一个PU不需要重新搜索 */ + } else { // DHP 寮鍚笖鍙傝冨抚鏁伴噺涓2鏃舵墠鏈夊彲鑳戒笂杩版潯浠朵笉鎴愮珛 + /* 鏈浼樼殑PU涓寘鍚弻鍓嶅悜鍧楋紝姝ゆ椂闇瑕佽绠桺U鍧囦负鍗曞墠鍚戞椂鐨凴DCosts锛屽啀閬嶅巻鍚庣画DMH妯″紡 */ + /* 姝ゆ椂闇閲嶆柊ME锛屽悓鏃剁涓涓狿U涓嶉渶瑕侀噸鏂版悳绱 */ cu_rdo_motion_estimation(h, p_cu, &h->me_state, -1); dmh_mode = 0; } - /* 总计 2 * (DMH_MODE_NUM - 1) + 1 个模式 */ + /* 鎬昏 2 * (DMH_MODE_NUM - 1) + 1 涓ā寮 */ max_dmh_mode = DMH_MODE_NUM + DMH_MODE_NUM - 1; - /* 快速算法,从DMH可选模式中估计最需要做的模式 - * 避免依次遍历所有模式巨大的计算量 + /* 蹇熺畻娉曪紝浠嶥MH鍙夋ā寮忎腑浼拌鏈闇瑕佸仛鐨勬ā寮 + * 閬垮厤渚濇閬嶅巻鎵鏈夋ā寮忓法澶х殑璁$畻閲 */ if (IS_ALG_ENABLE(OPT_DMH_CANDIDATE)) { dmh_mode_candidate = rdo_get_dmh_candidate(h, p_cu, *p_min_rdcost); } - // 当某个模式下的残差为全零时,跳过所有后续dmh模式 + // 褰撴煇涓ā寮忎笅鐨勬畫宸负鍏ㄩ浂鏃讹紝璺宠繃鎵鏈夊悗缁璬mh妯″紡 for (; dmh_mode < max_dmh_mode && !h->lcu.bypass_all_dmh; dmh_mode++) { if (IS_ALG_ENABLE(OPT_DMH_CANDIDATE)) { if (dmh_mode != 0 && dmh_mode != dmh_mode_candidate) { @@ -2259,7 +3095,7 @@ void cu_check_inter_partition(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int mode, in } } else { if (dmh_mode > (DMH_MODE_NUM - 1)) { - if (best_dmh_mode != (dmh_mode - (DMH_MODE_NUM - 1))) { // 只在同方向上扩展,其他跳过 + if (best_dmh_mode != (dmh_mode - (DMH_MODE_NUM - 1))) { // 鍙湪鍚屾柟鍚戜笂鎵╁睍锛屽叾浠栬烦杩 continue; } } @@ -2340,7 +3176,7 @@ typedef struct cu_skip_mc_t { } cu_skip_mc_t; /* 
--------------------------------------------------------------------------- - * 更新Skip的MV集,以检测当前模式的MV是否被遍历过 + * 鏇存柊Skip鐨凪V闆嗭紝浠ユ娴嬪綋鍓嶆ā寮忕殑MV鏄惁琚亶鍘嗚繃 */ static ALWAYS_INLINE int is_same_skip_mc_param(const cu_skip_mc_t *p_src1, const cu_skip_mc_t *p_src2) @@ -2360,7 +3196,7 @@ int is_same_skip_mc_param(const cu_skip_mc_t *p_src1, const cu_skip_mc_t *p_src2 } /* --------------------------------------------------------------------------- - * 更新Skip的MV集,以检测当前模式的MV是否被遍历过 + * 鏇存柊Skip鐨凪V闆嗭紝浠ユ娴嬪綋鍓嶆ā寮忕殑MV鏄惁琚亶鍘嗚繃 */ static int update_skip_mv_list(cu_skip_mc_t *p_skip_mvs, int i_num, cu_t *p_cu) @@ -2389,7 +3225,7 @@ int update_skip_mv_list(cu_skip_mc_t *p_skip_mvs, int i_num, cu_t *p_cu) } /* --------------------------------------------------------------------------- - * 检查Skip/Direct模式的编码代价(依据预测残差),选取最优的Skip子模式进行一次RDO + * 妫鏌kip/Direct妯″紡鐨勭紪鐮佷唬浠凤紙渚濇嵁棰勬祴娈嬪樊锛夛紝閫夊彇鏈浼樼殑Skip瀛愭ā寮忚繘琛屼竴娆DO */ static void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu_t *p_cu, rdcost_t *p_min_rdcost) @@ -2398,9 +3234,7 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu int num_mc_params = 0; int max_skip_mode_num, i; int cu_size = p_cu->i_size; - pixel_ssd_t cmp_skip = g_funcs.pixf.sa8d[PART_INDEX(cu_size, cu_size)]; cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); - pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; dist_t distortion; rdcost_t rdcost; rdcost_t min_rdcost = MAX_COST; @@ -2423,10 +3257,158 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu p_cu->cu_info.directskip_mhp_idx = DS_NONE; p_cu->cu_info.directskip_wsm_idx = 0; - /* 时域MVP预测的直接算RDCost,再跟空域的最优的RDCost做比较,增益 3%左右,时间增加 20%~30% */ + /* 鏃跺煙MVP棰勬祴鐨勭洿鎺ョ畻RDCost锛屽啀璺熺┖鍩熺殑鏈浼樼殑RDCost鍋氭瘮杈冿紝澧炵泭 3%宸﹀彸锛屾椂闂村鍔 20%~30% */ cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); + if (h->param->input_sample_bit_depth == 8) { + pixel8_ssd_t cmp_skip = 
g_funcs.pixf.sa8d8[PART_INDEX(cu_size, cu_size)]; + pel8_t *p_fenc = h->lcu.p_fenc8[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; + /* 2, Weighted skip mode, derive MV from temporal and scaling */ + for (i = 1; i < max_skip_mode_num; i++) { + int need_check_mv; + p_cu->cu_info.directskip_wsm_idx = (int8_t)i; + cu_set_mvs_skip(h, p_cu); + cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); + need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); + num_mc_params += need_check_mv; + if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { + rate = p_aec->binary.est_cu_header(h, p_aec, p_cu); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE); + rdcost = distortion + h->f_lambda_mode * rate; + if (rdcost < min_rdcost) { + XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best); + min_rdcost = rdcost; + best_weighted_skip = i; + } + } + } + + /* 3, 鍥涗釜spatial direct绫诲瀷 (single first, single second, dual first, dual second) */ + if ((h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) && (!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B)) { + p_cu->cu_info.directskip_wsm_idx = 0; + for (i = 0; i < DS_MAX_NUM; i++) { + int need_check_mv; + p_cu->cu_info.directskip_mhp_idx = (int8_t)i; + cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); + cu_set_mvs_skip(h, p_cu); + need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); + num_mc_params += need_check_mv; + if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { + rate = headerbits_skipmode[4+i]; + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE); + rdcost = distortion + h->f_lambda_mode * rate; + if (rdcost < min_rdcost) { + XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best); + min_rdcost = rdcost; + best_weighted_skip = 0; + best_skip_mode = i; + } + } + } + /* 鍦╠istortion鏈灏忕殑妯″紡涓夋嫨涓涓渶浼樼殑 */ + 
p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; + p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; + cu_set_mvs_skip(h, p_cu); + cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); + } else if ((h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) && (h->fdec->rps.poc == 2 || h->fdec->rps.poc == 6)) { + if (p_cu->p_left_cu != NULL && p_cu->p_topA_cu != NULL && p_cu->p_topL_cu != NULL && p_cu->p_topR_cu != NULL) { + if ((p_cu->p_left_cu->i_mode == 0 && p_cu->p_topA_cu->i_mode == 0 && p_cu->p_topL_cu->i_mode == 0 && p_cu->p_topR_cu->i_mode == 0) && (p_cu->p_left_cu->i_cbp == 0 || p_cu->p_topA_cu->i_cbp == 0 || p_cu->p_topL_cu->i_cbp == 0 || p_cu->p_topR_cu->i_cbp == 0)) { + p_cu->cu_info.directskip_wsm_idx = 0; + for (i = 0; i < DS_MAX_NUM; i++) { + int need_check_mv; + p_cu->cu_info.directskip_mhp_idx = (int8_t)i; + cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); + cu_set_mvs_skip(h, p_cu); + need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); + num_mc_params += need_check_mv; + if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { + rate = headerbits_skipmode[4 + i]; + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE); + rdcost = distortion + h->f_lambda_mode * rate; + if (rdcost < min_rdcost) { + XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best); + min_rdcost = rdcost; + best_weighted_skip = 0; + best_skip_mode = i; + } + } + } + /* 鍦╠istortion鏈灏忕殑妯″紡涓夋嫨涓涓渶浼樼殑 */ + p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; + p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; + cu_set_mvs_skip(h, p_cu); + cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); + + } else { + p_cu->cu_info.directskip_wsm_idx = 0; + for (i = 0; i < DS_MAX_NUM; i++) { + p_cu->cu_info.directskip_mhp_idx = (int8_t)i; + cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); + cu_set_mvs_skip(h, 
p_cu); + cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); + } + } + } else { + p_cu->cu_info.directskip_wsm_idx = 0; + for (i = 0; i < DS_MAX_NUM; i++) { + p_cu->cu_info.directskip_mhp_idx = (int8_t)i; + cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); + cu_set_mvs_skip(h, p_cu); + cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); + } + } + } else if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) { + if (p_cu->p_left_cu != NULL && p_cu->p_topA_cu != NULL && p_cu->p_topL_cu != NULL && p_cu->p_topR_cu != NULL) { + if ((p_cu->p_left_cu->i_mode == 0 && p_cu->p_topA_cu->i_mode == 0 && p_cu->p_topL_cu->i_mode == 0 && p_cu->p_topR_cu->i_mode == 0) && (p_cu->p_left_cu->i_cbp == 0 && p_cu->p_topA_cu->i_cbp == 0 && p_cu->p_topL_cu->i_cbp == 0 && p_cu->p_topR_cu->i_cbp == 0)) { + p_cu->cu_info.directskip_wsm_idx = 0; + for (i = 0; i < DS_MAX_NUM; i++) { + int need_check_mv; + p_cu->cu_info.directskip_mhp_idx = (int8_t)i; + cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); + cu_set_mvs_skip(h, p_cu); + need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); + num_mc_params += need_check_mv; + if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { + rate = headerbits_skipmode[4 + i]; + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE); + rdcost = distortion + h->f_lambda_mode * rate; + if (rdcost < min_rdcost) { + XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best); + min_rdcost = rdcost; + best_weighted_skip = 0; + best_skip_mode = i; + } + } + } + /* 鍦╠istortion鏈灏忕殑妯″紡涓夋嫨涓涓渶浼樼殑 */ + p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; + p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; + cu_set_mvs_skip(h, p_cu); + cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); + + } else { + p_cu->cu_info.directskip_wsm_idx = 0; + for (i = 0; i < DS_MAX_NUM; i++) { + p_cu->cu_info.directskip_mhp_idx = 
(int8_t)i; + cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); + cu_set_mvs_skip(h, p_cu); + cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); + } + } + } else { + p_cu->cu_info.directskip_wsm_idx = 0; + for (i = 0; i < DS_MAX_NUM; i++) { + p_cu->cu_info.directskip_mhp_idx = (int8_t)i; + cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); + cu_set_mvs_skip(h, p_cu); + cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); + } + } + } + } else { + pixel10_ssd_t cmp_skip = g_funcs.pixf.sa8d10[PART_INDEX(cu_size, cu_size)]; + pel10_t *p_fenc = h->lcu.p_fenc10[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; /* 2, Weighted skip mode, derive MV from temporal and scaling */ for (i = 1; i < max_skip_mode_num; i++) { int need_check_mv; @@ -2437,17 +3419,17 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = p_aec->binary.est_cu_header(h, p_aec, p_cu); - distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { - XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); + XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best); min_rdcost = rdcost; best_weighted_skip = i; } } } - /* 3, 四个spatial direct类型 (single first, single second, dual first, dual second) */ + /* 3, 鍥涗釜spatial direct绫诲瀷 (single first, single second, dual first, dual second) */ if ((h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) && (!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B)) { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { @@ -2459,17 +3441,17 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu num_mc_params += 
need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[4+i]; - distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { - XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); + XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best); min_rdcost = rdcost; best_weighted_skip = 0; best_skip_mode = i; } } } - /* 在distortion最小的模式中选择一个最优的 */ + /* 鍦╠istortion鏈灏忕殑妯″紡涓夋嫨涓涓渶浼樼殑 */ p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; cu_set_mvs_skip(h, p_cu); @@ -2487,17 +3469,17 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[4 + i]; - distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { - XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); + XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best); min_rdcost = rdcost; best_weighted_skip = 0; best_skip_mode = i; } } } - /* 在distortion最小的模式中选择一个最优的 */ + /* 鍦╠istortion鏈灏忕殑妯″紡涓夋嫨涓涓渶浼樼殑 */ p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; cu_set_mvs_skip(h, p_cu); @@ -2534,17 +3516,17 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[4 + i]; - distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, 
FREC_STRIDE); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { - XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); + XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best); min_rdcost = rdcost; best_weighted_skip = 0; best_skip_mode = i; } } } - /* 在distortion最小的模式中选择一个最优的 */ + /* 鍦╠istortion鏈灏忕殑妯″紡涓夋嫨涓涓渶浼樼殑 */ p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; cu_set_mvs_skip(h, p_cu); @@ -2569,6 +3551,7 @@ void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu } } } + } } static @@ -2578,9 +3561,7 @@ void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu int num_mc_params = 0; int max_skip_mode_num, i; int cu_size = p_cu->i_size; - pixel_ssd_t cmp_skip = g_funcs.pixf.sa8d[PART_INDEX(cu_size, cu_size)]; cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); - pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; dist_t distortion; rdcost_t rdcost; rdcost_t min_rdcost = MAX_COST; @@ -2603,15 +3584,80 @@ void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu p_cu->cu_info.directskip_mhp_idx = DS_NONE; p_cu->cu_info.directskip_wsm_idx = 0; - /* 时域MVP预测的直接算RDCost,再跟空域的最优的RDCost做比较,增益 3%左右,时间增加 20%~30% */ + /* 鏃跺煙MVP棰勬祴鐨勭洿鎺ョ畻RDCost锛屽啀璺熺┖鍩熺殑鏈浼樼殑RDCost鍋氭瘮杈冿紝澧炵泭 3%宸﹀彸锛屾椂闂村鍔 20%~30% */ cu_set_mvs_skip(h, p_cu); cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); num_mc_params += update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); + + if (h->param->input_sample_bit_depth == 8) { + pixel8_ssd_t cmp_skip = g_funcs.pixf.sa8d8[PART_INDEX(cu_size, cu_size)]; + pel8_t *p_fenc = h->lcu.p_fenc8[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; + if (rdo_get_pred_inter(h, p_cu, 1)) { + rate = 
headerbits_skipmode[0];//p_aec->binary.est_cu_header(h, p_aec, p_cu); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE); + min_rdcost = distortion + h->f_lambda_mode * rate; + XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best); + best_weighted_skip = 0; + best_skip_mode = DS_NONE; + } + + /* 2, Weighted skip mode, derive MV from temporal and scaling */ + for (i = 1; i < max_skip_mode_num; i++) { + int need_check_mv; + p_cu->cu_info.directskip_wsm_idx = (int8_t)i; + cu_set_mvs_skip(h, p_cu); + cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); + need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); + num_mc_params += need_check_mv; + if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { + rate = headerbits_skipmode[i];//p_aec->binary.est_cu_header(h, p_aec, p_cu); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE); + rdcost = distortion + h->f_lambda_mode * rate; + if (rdcost < min_rdcost) { + XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best); + min_rdcost = rdcost; + best_weighted_skip = i; + } + } + } + + /* 3, 鍥涗釜spatial direct绫诲瀷 (single first, single second, dual first, dual second) */ + if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) { + p_cu->cu_info.directskip_wsm_idx = 0; + for (i = 0; i < DS_MAX_NUM; i++) { + int need_check_mv; + p_cu->cu_info.directskip_mhp_idx = (int8_t)i; + cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); + cu_set_mvs_skip(h, p_cu); + need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); + num_mc_params += need_check_mv; + if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { + rate = headerbits_skipmode[4 + i];//p_aec->binary.est_cu_header(h, p_aec, p_cu); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter8, FREC_STRIDE); + rdcost = distortion + h->f_lambda_mode * rate; + if (rdcost < 
min_rdcost) { + XAVS2_SWAP_PTR(p_layer->buf_pred_inter8, p_layer->buf_pred_inter8_best); + min_rdcost = rdcost; + best_weighted_skip = 0; + best_skip_mode = i; + } + } + } + } + + /* 鍦╠istortion鏈灏忕殑妯″紡涓夋嫨涓涓渶浼樼殑 */ + p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; + p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; + cu_set_mvs_skip(h, p_cu); + cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); + } else { + pixel10_ssd_t cmp_skip = g_funcs.pixf.sa8d10[PART_INDEX(cu_size, cu_size)]; + pel10_t *p_fenc = h->lcu.p_fenc10[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; if (rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[0];//p_aec->binary.est_cu_header(h, p_aec, p_cu); - distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE); min_rdcost = distortion + h->f_lambda_mode * rate; - XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); + XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best); best_weighted_skip = 0; best_skip_mode = DS_NONE; } @@ -2626,17 +3672,17 @@ void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[i];//p_aec->binary.est_cu_header(h, p_aec, p_cu); - distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { - XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); + XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best); min_rdcost = rdcost; best_weighted_skip = i; } } } - /* 3, 四个spatial direct类型 (single first, single second, dual first, dual second) */ + /* 3, 鍥涗釜spatial direct绫诲瀷 (single first, single second, 
dual first, dual second) */ if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { @@ -2648,10 +3694,10 @@ void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[4 + i];//p_aec->binary.est_cu_header(h, p_aec, p_cu); - distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); + distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter10, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { - XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); + XAVS2_SWAP_PTR(p_layer->buf_pred_inter10, p_layer->buf_pred_inter10_best); min_rdcost = rdcost; best_weighted_skip = 0; best_skip_mode = i; @@ -2660,16 +3706,17 @@ void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu } } - /* 在distortion最小的模式中选择一个最优的 */ + /* 鍦╠istortion鏈灏忕殑妯″紡涓夋嫨涓涓渶浼樼殑 */ p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); + } } /* --------------------------------------------------------------------------- - * 检查Skip/Direct模式的编码代价(RDO),选取最优的Skip子模式 + * 妫鏌kip/Direct妯″紡鐨勭紪鐮佷唬浠凤紙RDO锛夛紝閫夊彇鏈浼樼殑Skip瀛愭ā寮 */ static void cu_check_skip_direct_fullrdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu_t *p_cu, rdcost_t *p_min_rdcost) @@ -2690,7 +3737,7 @@ void cu_check_skip_direct_fullrdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, c p_cu->cu_info.directskip_mhp_idx = DS_NONE; p_cu->cu_info.directskip_wsm_idx = 0; - /* 时域MVP预测的直接算RDCost,再跟空域的最优的RDCost做比较,增益 3%左右,时间增加 20%~30% */ + /* 鏃跺煙MVP棰勬祴鐨勭洿鎺ョ畻RDCost锛屽啀璺熺┖鍩熺殑鏈浼樼殑RDCost鍋氭瘮杈冿紝澧炵泭 3%宸﹀彸锛屾椂闂村鍔 20%~30% */ cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, 
p_min_rdcost, p_best); @@ -2701,7 +3748,7 @@ void cu_check_skip_direct_fullrdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, c cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } - /* 3, 四个spatial direct类型 (single first, single second, dual first, dual second) */ + /* 3, 鍥涗釜spatial direct绫诲瀷 (single first, single second, dual first, dual second) */ if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { @@ -2984,7 +4031,11 @@ rdcost_t compress_cu_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best h->lcu.b_enable_rdoq = (h->param->i_rdoq_level == RDOQ_ALL); h->lcu.b_2nd_rdcost_pass = 1; - h->lcu.get_intra_dir_for_rdo_luma = h->get_intra_candidates_luma; + if (h->param->input_sample_bit_depth == 8) { + h->lcu.get_intra_dir_for_rdo_luma8 = h->get_intra_candidates_luma8; + } else { + h->lcu.get_intra_dir_for_rdo_luma10 = h->get_intra_candidates_luma10; + } //===== SET VALID MODES ===== intra_modes = cu_get_valid_modes(h, h->i_type, i_level); @@ -2997,11 +4048,11 @@ rdcost_t compress_cu_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best //===== GET BEST MACROBLOCK MODE ===== for (mode = PRED_I_2Nx2N; mode <= PRED_I_nx2N; mode++) { if (!(intra_modes & (1 << mode))) { - continue; // 直接跳过不可用模式 + continue; // 鐩存帴璺宠繃涓嶅彲鐢ㄦā寮 } if (IS_ALG_ENABLE(OPT_BYPASS_SDIP)) { - // 最后一个非对称帧内模式的提前跳过 + // 鏈鍚庝竴涓潪瀵圭О甯у唴妯″紡鐨勬彁鍓嶈烦杩 if (sdip_early_bypass(h, p_layer, mode)) { continue; } @@ -3013,9 +4064,13 @@ rdcost_t compress_cu_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best cu_check_intra(h, p_aec, p_cu, best, mode, &min_rdcost); } - /* 检查最优模式,带RDOQ */ + /* 妫鏌ユ渶浼樻ā寮忥紝甯DOQ */ if (h->param->i_rdoq_level == RDOQ_CU_LEVEL && best->i_cbp > 0) { - h->lcu.get_intra_dir_for_rdo_luma = rdo_get_pred_intra_luma_2nd_pass; + if (h->param->input_sample_bit_depth == 8) { + h->lcu.get_intra_dir_for_rdo_luma8 = rdo_get_pred_intra_luma8_2nd_pass; + } else { + 
h->lcu.get_intra_dir_for_rdo_luma10 = rdo_get_pred_intra_luma10_2nd_pass; + } h->lcu.b_enable_rdoq = 1; mode = best->i_mode; cu_copy_info(&p_cu->cu_info, best); @@ -3043,10 +4098,14 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); /* ------------------------------------------------------------- - * 1, 初始化 + * 1, 鍒濆鍖 */ UNUSED_PARAMETER(cost_limit); - h->lcu.get_intra_dir_for_rdo_luma = h->get_intra_candidates_luma; + if (h->param->input_sample_bit_depth == 8) { + h->lcu.get_intra_dir_for_rdo_luma8 = h->get_intra_candidates_luma8; + } else { + h->lcu.get_intra_dir_for_rdo_luma10 = h->get_intra_candidates_luma10; + } h->enable_tu_2level = IS_ALG_ENABLE(OPT_TU_LEVEL_DEC) ? 0 : 2; h->lcu.b_enable_rdoq = (h->param->i_rdoq_level == RDOQ_ALL); h->lcu.b_2nd_rdcost_pass = 0; @@ -3056,12 +4115,12 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best } /* reset chroma intra predictor to default */ - p_cu->cu_info.i_intra_mode_c = DC_PRED_C; // @luofl:请勿移除此行,否则会导致不匹配问题;20170304 19:52:32 + p_cu->cu_info.i_intra_mode_c = DC_PRED_C; // @luofl锛氳鍕跨Щ闄ゆ琛岋紝鍚﹀垯浼氬鑷翠笉鍖归厤闂锛20170304 19:52:32 /* ------------------------------------------------------------- - * 2, 检查Skip和Direct模式 + * 2, 妫鏌kip鍜孌irect妯″紡 */ - /* 检查所有SKIP/Direct子模式 */ + /* 妫鏌ユ墍鏈塖KIP/Direct瀛愭ā寮 */ p_cu->cu_info.i_mode = PRED_SKIP; if (IS_ALG_ENABLE(OPT_ROUGH_SKIP_SEL) && h->skip_rough_improved) { @@ -3087,15 +4146,15 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best /* ------------------------------------------------------------- - * 3, 非Skip/Direct的帧间模式 + * 3, 闈濻kip/Direct鐨勫抚闂存ā寮 */ for (mode = 1; mode < MAX_INTER_MODES; mode++) { if (!(avail_modes & (1 << mode))) { - continue; // 直接跳过不可用模式的决策 + continue; // 鐩存帴璺宠繃涓嶅彲鐢ㄦā寮忕殑鍐崇瓥 } /* ------------------------------------------------------------- - * 3.1 与Skip/Direct模式相关的快速模式决策算法放在此处 + * 3.1 
涓嶴kip/Direct妯″紡鐩稿叧鐨勫揩閫熸ā寮忓喅绛栫畻娉曟斁鍦ㄦ澶 */ #if SAVE_CU_INFO @@ -3107,8 +4166,8 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best } #endif - /* 快速PU划分模式决策: - * 如果P2NxN未获得最优,直接跳过相同划分方向的PRED_2NxnU/PRED_2NxnD; PNx2N同理 */ + /* 蹇烶U鍒掑垎妯″紡鍐崇瓥锛 + * 濡傛灉P2NxN鏈幏寰楁渶浼橈紝鐩存帴璺宠繃鐩稿悓鍒掑垎鏂瑰悜鐨凱RED_2NxnU/PRED_2NxnD; PNx2N鍚岀悊 */ if (IS_ALG_ENABLE(OPT_BYPASS_AMP) && i_level > B16X16_IN_BIT) { if ((mode == PRED_2NxnU || mode == PRED_2NxnD) && best->i_mode != PRED_2NxN) { continue; @@ -3119,7 +4178,7 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best /* ------------------------------------------------------------- - * 3.2, 尝试编码当前PU划分模式 + * 3.2, 灏濊瘯缂栫爜褰撳墠PU鍒掑垎妯″紡 */ p_cu->cu_info.i_mode = (int8_t)mode; if (IS_ALG_ENABLE(OPT_ROUGH_PU_SEL) && mode == PRED_2Nx2N) { @@ -3127,15 +4186,15 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best cu_select_inter_partition(h, p_cu, i_level, avail_modes, &cur_best, &min_rdcost, b_dhp_enabled, b_check_dmh); mode = cur_best.i_mode; cu_copy_info(&p_cu->cu_info, &cur_best); - memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc_tmp, sizeof(p_cu->mc)); /* 拷贝MV信息用于补偿 */ + memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc_tmp, sizeof(p_cu->mc)); /* 鎷疯礉MV淇℃伅鐢ㄤ簬琛ュ伩 */ cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best); - avail_modes &= ~0xfe; // 禁用掉剩余帧间划分模式 + avail_modes &= ~0xfe; // 绂佺敤鎺夊墿浣欏抚闂村垝鍒嗘ā寮 } else { cu_check_inter_partition(h, p_aec, p_cu, mode, i_level, best, &min_rdcost, b_dhp_enabled, b_check_dmh); } /* ------------------------------------------------------------- - * 3.3, 当前普通PU划分模式编码后的快速决策算法 + * 3.3, 褰撳墠鏅歅U鍒掑垎妯″紡缂栫爜鍚庣殑蹇熷喅绛栫畻娉 */ if (best->i_mode == mode) { @@ -3165,29 +4224,29 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best } } - /* 做第二层TU划分,选出最优模式 */ + /* 鍋氱浜屽眰TU鍒掑垎锛岄夊嚭鏈浼樻ā寮 */ if (IS_ALG_ENABLE(OPT_TU_LEVEL_DEC) && best->i_cbp > 0) { h->enable_tu_2level = 1; mode = best->i_mode; cu_copy_info(&p_cu->cu_info, best); - 
memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc)); /* 拷贝MV信息用于补偿 */ + memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc)); /* 鎷疯礉MV淇℃伅鐢ㄤ簬琛ュ伩 */ cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best); }// end of checking inter PU partitions - /* 通过帧级预分析判定,此帧不需要做帧内预测时,跳过后续帧内模式 */ + /* 閫氳繃甯х骇棰勫垎鏋愬垽瀹氾紝姝ゅ抚涓嶉渶瑕佸仛甯у唴棰勬祴鏃讹紝璺宠繃鍚庣画甯у唴妯″紡 */ if (!h->fenc->b_enable_intra) { b_bypass_intra = 1; } if (IS_ALG_ENABLE(OPT_BYPASS_INTRA_BPIC)) { - b_bypass_intra |= (h->i_type == SLICE_TYPE_B && best->i_cbp == 0); // 禁用B帧的帧内预测模式 + b_bypass_intra |= (h->i_type == SLICE_TYPE_B && best->i_cbp == 0); // 绂佺敤B甯х殑甯у唴棰勬祴妯″紡 } - /* 条件禁用部分帧内划分模式 */ + /* 鏉′欢绂佺敤閮ㄥ垎甯у唴鍒掑垎妯″紡 */ if (IS_ALG_ENABLE(OPT_CMS_ETMD)) { - /* 帧间模式做完之后,若最优模式的CBP为零,则不再遍历所有帧内预测模式 */ + /* 甯ч棿妯″紡鍋氬畬涔嬪悗锛岃嫢鏈浼樻ā寮忕殑CBP涓洪浂锛屽垯涓嶅啀閬嶅巻鎵鏈夊抚鍐呴娴嬫ā寮 */ b_bypass_intra |= ((best->i_cbp == 0) && (best->i_mode == 0)); - /* 依据帧间最优划分模式,筛选不需要遍历的模式 */ + /* 渚濇嵁甯ч棿鏈浼樺垝鍒嗘ā寮忥紝绛涢変笉闇瑕侀亶鍘嗙殑妯″紡 */ // if (IS_HOR_PU_PART(best->i_mode)) { // avail_modes &= !(1 << PRED_I_nx2N); // } else if (IS_VER_PU_PART(best->i_mode)) { @@ -3207,7 +4266,7 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best } } - /* 若当前最小RDCost小于了某个阈值,表明帧间预测模式已经能够较好地预测,此时不再继续尝试帧内模式 */ + /* 鑻ュ綋鍓嶆渶灏廟DCost灏忎簬浜嗘煇涓槇鍊硷紝琛ㄦ槑甯ч棿棰勬祴妯″紡宸茬粡鑳藉杈冨ソ鍦伴娴嬶紝姝ゆ椂涓嶅啀缁х画灏濊瘯甯у唴妯″紡 */ if (IS_ALG_ENABLE(OPT_FAST_INTRA_IN_INTER) && min_rdcost < h->thres_qsfd_cu[1][i_level - MIN_CU_SIZE_IN_BIT]) { b_bypass_intra = 1; } @@ -3218,11 +4277,11 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best if (!b_bypass_intra) { for (mode = PRED_I_2Nx2N; mode <= PRED_I_nx2N; mode++) { if (!(avail_modes & (1 << mode))) { - continue; // 直接跳过不可用模式的决策 + continue; // 鐩存帴璺宠繃涓嶅彲鐢ㄦā寮忕殑鍐崇瓥 } if (IS_ALG_ENABLE(OPT_BYPASS_SDIP)) { - // 最后一个非对称帧内模式的提前跳过 + // 鏈鍚庝竴涓潪瀵圭О甯у唴妯″紡鐨勬彁鍓嶈烦杩 if (sdip_early_bypass(h, p_layer, mode)) { continue; } @@ -3242,14 +4301,18 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best } } - /* 
检查最优模式,包括TU划分还是不划分的确定,带RDOQ */ + /* 妫鏌ユ渶浼樻ā寮,鍖呮嫭TU鍒掑垎杩樻槸涓嶅垝鍒嗙殑纭畾锛屽甫RDOQ */ if (h->param->i_rdoq_level == RDOQ_CU_LEVEL&& best->i_cbp > 0) { if (IS_ALG_ENABLE(OPT_TU_LEVEL_DEC)) { h->enable_tu_2level = 3; } else { h->enable_tu_2level = 2; } - h->lcu.get_intra_dir_for_rdo_luma = rdo_get_pred_intra_luma_2nd_pass; + if (h->param->input_sample_bit_depth == 8) { + h->lcu.get_intra_dir_for_rdo_luma8 = rdo_get_pred_intra_luma8_2nd_pass; + } else { + h->lcu.get_intra_dir_for_rdo_luma10 = rdo_get_pred_intra_luma10_2nd_pass; + } h->lcu.b_enable_rdoq = 1; h->lcu.b_2nd_rdcost_pass = 1; mode = best->i_mode; @@ -3259,12 +4322,16 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best cu_check_intra(h, p_aec, p_cu, best, mode, &min_rdcost); } } else { - memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc)); /* 拷贝MV信息用于补偿 */ + memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc)); /* 鎷疯礉MV淇℃伅鐢ㄤ簬琛ュ伩 */ cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best); } } else if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && i_level >= 5 && (best->i_mode != PRED_SKIP || best->i_cbp != 0)) { h->enable_tu_2level = 2; - h->lcu.get_intra_dir_for_rdo_luma = rdo_get_pred_intra_luma_2nd_pass; + if (h->param->input_sample_bit_depth == 8) { + h->lcu.get_intra_dir_for_rdo_luma8 = rdo_get_pred_intra_luma8_2nd_pass; + } else { + h->lcu.get_intra_dir_for_rdo_luma10 = rdo_get_pred_intra_luma10_2nd_pass; + } h->lcu.b_2nd_rdcost_pass = 1; // recheck RDCost mode = best->i_mode; @@ -3272,7 +4339,7 @@ rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best if (IS_INTRA_MODE(mode)) { cu_check_intra(h, p_aec, p_cu, best, mode, &min_rdcost); } else { - memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc)); /* 拷贝MV信息用于补偿 */ + memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc)); /* 鎷疯礉MV淇℃伅鐢ㄤ簬琛ュ伩 */ cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best); } } @@ -3288,14 +4355,20 @@ int ctu_intra_depth_pred_mad(xavs2_t *h, int level, int pix_x, 
int pix_y) static const int MAD_TH0[] = { 2, 2 * 256, 2 * 1024, 3 * 4096 }; - pel_t *p_src_base = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE + pix_x; int cu_size = 1 << level; - int mad = g_funcs.pixf.madf[level - MIN_CU_SIZE_IN_BIT](p_src_base, FENC_STRIDE, cu_size); + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_src_base = h->lcu.p_fenc8[0] + pix_y * FENC_STRIDE + pix_x; + int mad = g_funcs.pixf.madf8[level - MIN_CU_SIZE_IN_BIT](p_src_base, FENC_STRIDE, cu_size); return mad >= MAD_TH0[level - MIN_CU_SIZE_IN_BIT]; -} + } else { + pel10_t *p_src_base = h->lcu.p_fenc10[0] + pix_y * FENC_STRIDE + pix_x; + int mad = g_funcs.pixf.madf10[level - MIN_CU_SIZE_IN_BIT](p_src_base, FENC_STRIDE, cu_size); + return mad >= MAD_TH0[level - MIN_CU_SIZE_IN_BIT]; + } +} /** * =========================================================================== @@ -3304,7 +4377,7 @@ int ctu_intra_depth_pred_mad(xavs2_t *h, int level, int pix_x, int pix_y) */ /* --------------------------------------------------------------------------- - * RDOPT初始化时,设置不同帧和CU大小可用的模式,后续直接查表 + * RDOPT鍒濆鍖栨椂锛岃缃笉鍚屽抚鍜孋U澶у皬鍙敤鐨勬ā寮忥紝鍚庣画鐩存帴鏌ヨ〃 */ void xavs2_init_valid_mode_table(xavs2_t *h) { @@ -3466,7 +4539,7 @@ rdcost_t compress_ctu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, i b_split_ctu &= !is_ET_inter_recur(h, p_cu, best); } - /* 当前CU和上一层CU的最优模式均为SKIP模式,则跳过下层CU的划分 @张玉槐 */ + /* 褰撳墠CU鍜屼笂涓灞侰U鐨勬渶浼樻ā寮忓潎涓篠KIP妯″紡锛屽垯璺宠繃涓嬪眰CU鐨勫垝鍒 @寮犵帀妲 */ if (IS_ALG_ENABLE(OPT_CU_CSET) && ((p_cu->i_size <= 16 && h->i_type == SLICE_TYPE_B) || (p_cu->i_size <= 32 && h->fdec->rps.referd_by_others == 0))) { cu_layer_t *p_ulayer = cu_get_layer(h, i_level + 1); @@ -3524,7 +4597,7 @@ rdcost_t compress_ctu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, i if (IS_ALG_ENABLE(OPT_SUBCU_SPLIT)) { if ((p_cu->sub_cu[0] != NULL) && (p_cu->sub_cu[1] != NULL) && (p_cu->sub_cu[2] != NULL) && (p_cu->sub_cu[3] != NULL)) { if (((p_cu->sub_cu[0]->is_ctu_split + p_cu->sub_cu[1]->is_ctu_split + p_cu->sub_cu[2]->is_ctu_split + 
p_cu->sub_cu[3]->is_ctu_split) >= 3)) { - b_check_large_cu = FALSE; // 1080p 20% 节省,约1.7%损失,preset 6,1080p + b_check_large_cu = FALSE; // 1080p 20% 鑺傜渷锛岀害1.7%鎹熷け锛宲reset 6锛1080p } /* else if (((!p_cu->sub_cu[0]->is_ctu_split) && ((p_cu->sub_cu[0]->cu_info.i_mode == PRED_SKIP || p_cu->sub_cu[0]->cu_info.i_mode == PRED_2Nx2N) && (p_cu->sub_cu[0]->cu_info.i_cbp == 0))) && ((!p_cu->sub_cu[1]->is_ctu_split) && ((p_cu->sub_cu[1]->cu_info.i_mode == PRED_SKIP || p_cu->sub_cu[1]->cu_info.i_mode == PRED_2Nx2N) && (p_cu->sub_cu[1]->cu_info.i_cbp == 0))) diff --git a/source/encoder/sao.c b/source/encoder/sao.c index f47d676..6ffdd13 100644 --- a/source/encoder/sao.c +++ b/source/encoder/sao.c @@ -41,7 +41,9 @@ #include "filter.h" #include "cpu.h" #include "cudata.h" +#if HAVE_MMX #include "vec/intrinsic.h" +#endif static const int tab_sao_check_mode_fast[3][5] = { 1, 1, 0, 0, 0, @@ -65,7 +67,7 @@ static ALWAYS_INLINE void sao_init_stat_data(SAOStatData *p_stats) /* --------------------------------------------------------------------------- */ static -void sao_get_stat_block_EO_0(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, +void sao_get_stat_block_EO_0(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *p_stats, sao_region_t *p_region, int compIdx) { int start_x, end_x, start_y, end_y; @@ -81,10 +83,11 @@ void sao_get_stat_block_EO_0(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, int i_rec = frm_rec->i_stride[compIdx]; int i_org = frm_org->i_stride[compIdx]; - const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x; - const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x; - const pel_t *p_org_iter; - const pel_t *p_rec_iter; + if (h->param->input_sample_bit_depth == 8) { + const pel8_t *p_rec = frm_rec->planes8[compIdx] + pix_y * i_rec + pix_x; + const pel8_t *p_org = frm_org->planes8[compIdx] + pix_y * i_org + pix_x; + const pel8_t *p_org_iter; + const pel8_t *p_rec_iter; sao_init_stat_data(p_stats); p_org_iter = p_org; 
p_rec_iter = p_rec; @@ -106,12 +109,39 @@ void sao_get_stat_block_EO_0(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, p_rec_iter += i_rec; p_org_iter += i_org; } + } else { + const pel10_t *p_rec = frm_rec->planes10[compIdx] + pix_y * i_rec + pix_x; + const pel10_t *p_org = frm_org->planes10[compIdx] + pix_y * i_org + pix_x; + const pel10_t *p_org_iter; + const pel10_t *p_rec_iter;\ + sao_init_stat_data(p_stats); + p_org_iter = p_org; + p_rec_iter = p_rec; + start_y = 0; + end_y = height; + start_x = p_region->b_left ? 0 : 1; + end_x = p_region->b_right ? width : (width - 1); + p_org_iter = p_org + start_y * i_org; + p_rec_iter += start_y * i_rec; + for (y = start_y; y < end_y; y++) { + leftsign = xavs2_sign3(p_rec_iter[start_x] - p_rec_iter[start_x - 1]); + for (x = start_x; x < end_x; x++) { + rightsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1]); + edgetype = leftsign + rightsign; + leftsign = -rightsign; + p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); + p_stats->count[edgetype + 2]++; + } + p_rec_iter += i_rec; + p_org_iter += i_org; + } + } } /* --------------------------------------------------------------------------- */ static -void sao_get_stat_block_EO_90(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, +void sao_get_stat_block_EO_90(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *p_stats, sao_region_t *p_region, int compIdx) { int start_x, end_x, start_y, end_y; @@ -127,10 +157,11 @@ void sao_get_stat_block_EO_90(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, int i_rec = frm_rec->i_stride[compIdx]; int i_org = frm_org->i_stride[compIdx]; - const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x; - const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x; - const pel_t *p_org_iter; - const pel_t *p_rec_iter; + if (h->param->input_sample_bit_depth == 8) { + const pel8_t *p_rec = frm_rec->planes8[compIdx] + pix_y * i_rec + pix_x; + const pel8_t *p_org = frm_org->planes8[compIdx] + 
pix_y * i_org + pix_x; + const pel8_t *p_org_iter; + const pel8_t *p_rec_iter; sao_init_stat_data(p_stats); @@ -150,12 +181,37 @@ void sao_get_stat_block_EO_90(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, p_stats->count[edgetype + 2]++; } } + } else { + const pel10_t *p_rec = frm_rec->planes10[compIdx] + pix_y * i_rec + pix_x; + const pel10_t *p_org = frm_org->planes10[compIdx] + pix_y * i_org + pix_x; + const pel10_t *p_org_iter; + const pel10_t *p_rec_iter; + + sao_init_stat_data(p_stats); + + p_org_iter = p_org; + p_rec_iter = p_rec; + start_x = 0; + end_x = width; + start_y = p_region->b_top ? 0 : 1; + end_y = p_region->b_down ? height : (height - 1); + for (x = start_x; x < end_x; x++) { + upsign = xavs2_sign3(p_rec_iter[start_y * i_rec + x] - p_rec_iter[(start_y - 1) * i_rec + x]); + for (y = start_y; y < end_y; y++) { + downsign = xavs2_sign3(p_rec_iter[y * i_rec + x] - p_rec_iter[(y + 1) * i_rec + x]); + edgetype = downsign + upsign; + upsign = -downsign; + p_stats->diff[edgetype + 2] += (p_org_iter[y * i_org + x] - p_rec_iter[y * i_rec + x]); + p_stats->count[edgetype + 2]++; + } + } + } } /* --------------------------------------------------------------------------- */ static -void sao_get_stat_block_EO_135(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, +void sao_get_stat_block_EO_135(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *p_stats, sao_region_t *p_region, int compIdx) { int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; @@ -173,10 +229,71 @@ void sao_get_stat_block_EO_135(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, int i_rec = frm_rec->i_stride[compIdx]; int i_org = frm_org->i_stride[compIdx]; - const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x; - const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x; - const pel_t *p_org_iter; - const pel_t *p_rec_iter; + if (h->param->input_sample_bit_depth == 8) { + const pel8_t *p_rec = frm_rec->planes8[compIdx] + pix_y * 
i_rec + pix_x; + const pel8_t *p_org = frm_org->planes8[compIdx] + pix_y * i_org + pix_x; + const pel8_t *p_org_iter; + const pel8_t *p_rec_iter; + + sao_init_stat_data(p_stats); + + p_org_iter = p_org; + p_rec_iter = p_rec; + start_x_r0 = p_region->b_top_left ? 0 : 1; + end_x_r0 = p_region->b_top ? (p_region->b_right ? width : (width - 1)) : 1; + start_x_r = p_region->b_left ? 0 : 1; + end_x_r = p_region->b_right ? width : (width - 1); + start_x_rn = p_region->b_down ? (p_region->b_left ? 0 : 1) : (width - 1); + end_x_rn = p_region->b_right_down ? width : (width - 1); + + // init the line buffer + for (x = start_x_r + 1; x < end_x_r + 1; x++) { + upsign = xavs2_sign3(p_rec_iter[x + i_rec] - p_rec_iter[x - 1]); + signupline[x] = upsign; + } + // first row + for (x = start_x_r0; x < end_x_r0; x++) { + upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 - i_rec]); + edgetype = upsign - signupline[x + 1]; + p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); + p_stats->count[edgetype + 2]++; + } + + // middle rows + p_rec_iter += i_rec; + p_org_iter += i_org; + for (y = 1; y < height - 1; y++) { + for (x = start_x_r; x < end_x_r; x++) { + if (x == start_x_r) { + upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 - i_rec]); + signupline[x] = upsign; + } + downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 + i_rec]); + edgetype = downsign + signupline[x]; + p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); + p_stats->count[edgetype + 2]++; + signupline[x] = (char)reg; + reg = -downsign; + } + p_rec_iter += i_rec; + p_org_iter += i_org; + } + // last row + for (x = start_x_rn; x < end_x_rn; x++) { + if (x == start_x_r) { + upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 - i_rec]); + signupline[x] = upsign; + } + downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 + i_rec]); + edgetype = downsign + signupline[x]; + p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); + p_stats->count[edgetype + 2]++; + } + } else { 
+ const pel10_t *p_rec = frm_rec->planes10[compIdx] + pix_y * i_rec + pix_x; + const pel10_t *p_org = frm_org->planes10[compIdx] + pix_y * i_org + pix_x; + const pel10_t *p_org_iter; + const pel10_t *p_rec_iter; sao_init_stat_data(p_stats); @@ -232,12 +349,13 @@ void sao_get_stat_block_EO_135(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); p_stats->count[edgetype + 2]++; } + } } /* --------------------------------------------------------------------------- */ static -void sao_get_stat_block_EO_45(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, +void sao_get_stat_block_EO_45(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *p_stats, sao_region_t *p_region, int compIdx) { int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; @@ -255,10 +373,11 @@ void sao_get_stat_block_EO_45(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, int i_rec = frm_rec->i_stride[compIdx]; int i_org = frm_org->i_stride[compIdx]; - const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x; - const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x; - const pel_t *p_org_iter; - const pel_t *p_rec_iter; + if (h->param->input_sample_bit_depth == 8) { + const pel8_t *p_rec = frm_rec->planes8[compIdx] + pix_y * i_rec + pix_x; + const pel8_t *p_org = frm_org->planes8[compIdx] + pix_y * i_org + pix_x; + const pel8_t *p_org_iter; + const pel8_t *p_rec_iter; sao_init_stat_data(p_stats); @@ -313,12 +432,72 @@ void sao_get_stat_block_EO_45(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); p_stats->count[edgetype + 2]++; } + } else { + const pel10_t *p_rec = frm_rec->planes10[compIdx] + pix_y * i_rec + pix_x; + const pel10_t *p_org = frm_org->planes10[compIdx] + pix_y * i_org + pix_x; + const pel10_t *p_org_iter; + const pel10_t *p_rec_iter; + + sao_init_stat_data(p_stats); + + p_org_iter = p_org; + p_rec_iter = p_rec; + 
start_x_r0 = p_region->b_top ? (p_region->b_left ? 0 : 1) : (width - 1); + end_x_r0 = p_region->b_top_right ? width : (width - 1); + start_x_r = p_region->b_left ? 0 : 1; + end_x_r = p_region->b_right ? width : (width - 1); + start_x_rn = p_region->b_down_left ? 0 : 1; + end_x_rn = p_region->b_down ? (p_region->b_right ? width : (width - 1)) : 1; + + // init the line buffer + signupline1 = signupline + 1; + for (x = start_x_r - 1; x < XAVS2_MAX(end_x_r - 1, end_x_r0 - 1); x++) { + upsign = xavs2_sign3(p_rec_iter[x + i_rec] - p_rec_iter[x + 1]); + signupline1[x] = upsign; + } + // first row + for (x = start_x_r0; x < end_x_r0; x++) { + upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 - i_rec]); + edgetype = upsign - signupline1[x - 1]; + p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); + p_stats->count[edgetype + 2]++; + } + + // middle rows + p_rec_iter += i_rec; + p_org_iter += i_org; + for (y = 1; y < height - 1; y++) { + for (x = start_x_r; x < end_x_r; x++) { + if (x == end_x_r - 1) { + upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 - i_rec]); + signupline1[x] = upsign; + } + downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 + i_rec]); + edgetype = downsign + signupline1[x]; + p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); + p_stats->count[edgetype + 2]++; + signupline1[x - 1] = -downsign; + } + p_rec_iter += i_rec; + p_org_iter += i_org; + } + for (x = start_x_rn; x < end_x_rn; x++) { + if (x == end_x_r - 1) { + upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 - i_rec]); + signupline1[x] = upsign; + } + downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 + i_rec]); + edgetype = downsign + signupline1[x]; + p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); + p_stats->count[edgetype + 2]++; + } + } } /* --------------------------------------------------------------------------- */ static -void sao_get_stat_block_BO(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, +void 
sao_get_stat_block_BO(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *p_stats, sao_region_t *p_region, int compIdx) { int start_x, end_x, start_y, end_y; @@ -334,16 +513,41 @@ void sao_get_stat_block_BO(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, int i_rec = frm_rec->i_stride[compIdx]; int i_org = frm_org->i_stride[compIdx]; - const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x; - const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x; - const pel_t *p_org_iter; - const pel_t *p_rec_iter; + if (h->param->input_sample_bit_depth == 8) { + const pel8_t *p_rec = frm_rec->planes8[compIdx] + pix_y * i_rec + pix_x; + const pel8_t *p_org = frm_org->planes8[compIdx] + pix_y * i_org + pix_x; + const pel8_t *p_org_iter; + const pel8_t *p_rec_iter; + + sao_init_stat_data(p_stats); + + p_org_iter = p_org; + p_rec_iter = p_rec; + band_shift = (h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT); + start_x = 0; + end_x = width; + start_y = 0; + end_y = height; + for (y = start_y; y < end_y; y++) { + for (x = start_x; x < end_x; x++) { + bandtype = p_rec_iter[x] >> band_shift; + p_stats->diff[bandtype] += (p_org_iter[x] - p_rec_iter[x]); + p_stats->count[bandtype]++; + } + p_rec_iter += i_rec; + p_org_iter += i_org; + } + } else { + const pel10_t *p_rec = frm_rec->planes10[compIdx] + pix_y * i_rec + pix_x; + const pel10_t *p_org = frm_org->planes10[compIdx] + pix_y * i_org + pix_x; + const pel10_t *p_org_iter; + const pel10_t *p_rec_iter; sao_init_stat_data(p_stats); p_org_iter = p_org; p_rec_iter = p_rec; - band_shift = (g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT); + band_shift = (h->param->input_sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT); start_x = 0; end_x = width; start_y = 0; @@ -357,11 +561,12 @@ void sao_get_stat_block_BO(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, p_rec_iter += i_rec; p_org_iter += i_org; } + } } /* --------------------------------------------------------------------------- */ 
-typedef void(*sao_pf)(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, +typedef void(*sao_pf)(xavs2_t *h, xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *stat_datas, sao_region_t *p_region, int compIdx); sao_pf gf_sao_stat[5] = { @@ -517,7 +722,7 @@ static void find_offset(int typeIdc, SAOStatData *p_stat, SAOBlkParam *p_param, start_band2 = XAVS2_MAX(best_start_band1, best_start_band2); delta_band12 = (start_band2 - start_band1); if (delta_band12 > (NUM_SAO_BO_CLASSES >> 1)) { - p_param->deltaBand = 32 - delta_band12; // TODO: 这里应该是 (32 + delta_band12) + p_param->deltaBand = 32 - delta_band12; // TODO: 杩欓噷搴旇鏄 (32 + delta_band12) p_param->startBand = start_band2; } else { p_param->deltaBand = delta_band12; @@ -733,8 +938,168 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s int i_src = h->img_sao->i_stride[compIdx]; int i_dst = h->fdec->i_stride[compIdx]; - pel_t *dst = h->fdec->planes[compIdx] + pix_y * i_dst + pix_x; - pel_t *src = h->img_sao->planes[compIdx] + pix_y * i_src + pix_x; + if (h->param->input_sample_bit_depth == 8) { + pel8_t *dst = h->fdec->planes8[compIdx] + pix_y * i_dst + pix_x; + pel8_t *src = h->img_sao->planes8[compIdx] + pix_y * i_src + pix_x; + + assert(blk_param->typeIdc != SAO_TYPE_OFF); + + switch (blk_param->typeIdc) { + case SAO_TYPE_EO_0: + end_y = height; + start_x = p_region->b_left ? 0 : 1; + end_x = p_region->b_right ? width : (width - 1); + for (y = 0; y < end_y; y++) { + leftsign = xavs2_sign3(src[start_x] - src[start_x - 1]); + for (x = start_x; x < end_x; x++) { + rightsign = xavs2_sign3(src[x] - src[x + 1]); + edgetype = leftsign + rightsign; + leftsign = -rightsign; + dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + } + src += i_src; + dst += i_dst; + } + break; + case SAO_TYPE_EO_90: { + pel8_t *src_base = src; + pel8_t *dst_base = dst; + start_x = 0; + end_x = width; + start_y = p_region->b_top ? 0 : 1; + end_y = p_region->b_down ? 
height : (height - 1); + + src_base += start_y * i_src; + dst_base += start_y * i_dst; + for (x = start_x; x < end_x; x++) { + src = src_base; + dst = dst_base; + upsign = xavs2_sign3(src[0] - src[-i_src]); + for (y = start_y; y < end_y; y++) { + downsign = xavs2_sign3(src[0] - src[i_src]); + edgetype = downsign + upsign; + upsign = -downsign; + *dst = (pel8_t)XAVS2_CLIP3(0, max_val, src[0] + blk_param->offset[edgetype + 2]); + src += i_src; + dst += i_dst; + } + src_base++; + dst_base++; + } + break; + } + case SAO_TYPE_EO_135: { + start_x_r0 = p_region->b_top_left ? 0 : 1; + end_x_r0 = p_region->b_top ? (p_region->b_right ? width : (width - 1)) : 1; + start_x_r = p_region->b_left ? 0 : 1; + end_x_r = p_region->b_right ? width : (width - 1); + start_x_rn = p_region->b_down ? (p_region->b_left ? 0 : 1) : (width - 1); + end_x_rn = p_region->b_right_down ? width : (width - 1); + + // init the line buffer + for (x = start_x_r + 1; x < end_x_r + 1; x++) { + signupline[x] = xavs2_sign3(src[x + i_src] - src[x - 1]); + } + // first row + for (x = start_x_r0; x < end_x_r0; x++) { + upsign = xavs2_sign3(src[x] - src[x - 1 - i_src]); + edgetype = upsign - signupline[x + 1]; + dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + } + // middle rows + src += i_src; + dst += i_dst; + for (y = 1; y < height - 1; y++) { + x = start_x_r; + signupline[x] = xavs2_sign3(src[x] - src[x - 1 - i_src]); + for (; x < end_x_r; x++) { + downsign = xavs2_sign3(src[x] - src[x + 1 + i_src]); + edgetype = downsign + signupline[x]; + dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + signupline[x] = reg; + reg = -downsign; + } + dst += i_dst; + src += i_src; + } + // last row + x = start_x_rn; + signupline[x] = xavs2_sign3(src[x] - src[x - 1 - i_src]); + for (; x < end_x_rn; x++) { + downsign = xavs2_sign3(src[x] - src[x + 1 + i_src]); + edgetype = downsign + signupline[x]; + dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + 
blk_param->offset[edgetype + 2]); + } + } + break; + case SAO_TYPE_EO_45: { + start_x_r0 = p_region->b_top ? (p_region->b_left ? 0 : 1) : (width - 1); + end_x_r0 = p_region->b_top_right ? width : (width - 1); + start_x_r = p_region->b_left ? 0 : 1; + end_x_r = p_region->b_right ? width : (width - 1); + start_x_rn = p_region->b_down_left ? 0 : 1; + end_x_rn = p_region->b_down ? (p_region->b_right ? width : (width - 1)) : 1; + + // init the line buffer + for (x = start_x_r; x < end_x_r; x++) { + signupline[x] = xavs2_sign3(src[x - 1 + i_src] - src[x]); + } + // first row + for (x = start_x_r0; x < end_x_r0; x++) { + upsign = xavs2_sign3(src[x] - src[x + 1 - i_src]); + edgetype = upsign - signupline[x]; + dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + } + // middle rows + src += i_src; + dst += i_dst; + for (y = 1; y < height - 1; y++) { + signupline[end_x_r] = xavs2_sign3(src[end_x_r - 1] - src[end_x_r - i_src]); + for (x = start_x_r; x < end_x_r; x++) { + downsign = xavs2_sign3(src[x] - src[x - 1 + i_src]); + edgetype = downsign + signupline[x + 1]; + dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + signupline[x] = -downsign; + } + src += i_src; + dst += i_dst; + } + //last row + for (x = start_x_rn; x < end_x_rn; x++) { + if (x == end_x_r - 1) { + upsign = xavs2_sign3(src[x] - src[x + 1 - i_src]); + signupline[x + 1] = upsign; + } + downsign = xavs2_sign3(src[x] - src[x - 1 + i_src]); + edgetype = downsign + signupline[x + 1]; + dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + } + break; + } + case SAO_TYPE_BO: + band_shift = (h->param->sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT); + start_x = 0; + end_x = width; + start_y = 0; + end_y = height; + src += start_y * i_src; + dst += start_y * i_dst; + for (y = start_y; y < end_y; y++) { + for (x = start_x; x < end_x; x++) { + bandtype = src[x] >> band_shift; + dst[x] = (pel8_t)XAVS2_CLIP3(0, max_val, 
src[x] + blk_param->offset[bandtype]); + } + src += i_src; + dst += i_dst; + } + break; + default: + xavs2_log(h, XAVS2_LOG_ERROR, "Not a supported SAO types for SAO_on_Block\n"); + exit(-1); + } + } else { + pel10_t *dst = h->fdec->planes10[compIdx] + pix_y * i_dst + pix_x; + pel10_t *src = h->img_sao->planes10[compIdx] + pix_y * i_src + pix_x; assert(blk_param->typeIdc != SAO_TYPE_OFF); @@ -749,15 +1114,15 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s rightsign = xavs2_sign3(src[x] - src[x + 1]); edgetype = leftsign + rightsign; leftsign = -rightsign; - dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); } src += i_src; dst += i_dst; } break; case SAO_TYPE_EO_90: { - pel_t *src_base = src; - pel_t *dst_base = dst; + pel10_t *src_base = src; + pel10_t *dst_base = dst; start_x = 0; end_x = width; start_y = p_region->b_top ? 0 : 1; @@ -773,7 +1138,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s downsign = xavs2_sign3(src[0] - src[i_src]); edgetype = downsign + upsign; upsign = -downsign; - *dst = (pel_t)XAVS2_CLIP3(0, max_val, src[0] + blk_param->offset[edgetype + 2]); + *dst = (pel10_t)XAVS2_CLIP3(0, max_val, src[0] + blk_param->offset[edgetype + 2]); src += i_src; dst += i_dst; } @@ -798,7 +1163,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s for (x = start_x_r0; x < end_x_r0; x++) { upsign = xavs2_sign3(src[x] - src[x - 1 - i_src]); edgetype = upsign - signupline[x + 1]; - dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); } // middle rows src += i_src; @@ -809,7 +1174,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s for (; x < end_x_r; x++) { downsign = xavs2_sign3(src[x] - src[x + 
1 + i_src]); edgetype = downsign + signupline[x]; - dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); signupline[x] = reg; reg = -downsign; } @@ -822,7 +1187,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s for (; x < end_x_rn; x++) { downsign = xavs2_sign3(src[x] - src[x + 1 + i_src]); edgetype = downsign + signupline[x]; - dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); } } break; @@ -842,7 +1207,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s for (x = start_x_r0; x < end_x_r0; x++) { upsign = xavs2_sign3(src[x] - src[x + 1 - i_src]); edgetype = upsign - signupline[x]; - dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); } // middle rows src += i_src; @@ -852,7 +1217,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s for (x = start_x_r; x < end_x_r; x++) { downsign = xavs2_sign3(src[x] - src[x - 1 + i_src]); edgetype = downsign + signupline[x + 1]; - dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); signupline[x] = -downsign; } src += i_src; @@ -866,7 +1231,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s } downsign = xavs2_sign3(src[x] - src[x - 1 + i_src]); edgetype = downsign + signupline[x + 1]; - dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); + dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); } break; } @@ -881,7 +1246,7 @@ static void sao_filter_region(xavs2_t *h, 
SAOBlkParam *blk_param, int compIdx, s for (y = start_y; y < end_y; y++) { for (x = start_x; x < end_x; x++) { bandtype = src[x] >> band_shift; - dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[bandtype]); + dst[x] = (pel10_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[bandtype]); } src += i_src; dst += i_dst; @@ -891,6 +1256,7 @@ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, s xavs2_log(h, XAVS2_LOG_ERROR, "Not a supported SAO types for SAO_on_Block\n"); exit(-1); } + } } /* --------------------------------------------------------------------------- @@ -907,7 +1273,7 @@ static void sao_get_neighbor_avail(xavs2_t *h, sao_region_t *p_avail, int i_lcu_ int width_c = width >> 1; int height_c = height >> 1; - /* 可用性获取 */ + /* 鍙敤鎬ц幏鍙 */ p_avail->b_left = i_lcu_x != 0; p_avail->b_top = i_lcu_y != 0; p_avail->b_right = (i_lcu_x < h->i_width_in_lcu - 1); @@ -928,7 +1294,7 @@ static void sao_get_neighbor_avail(xavs2_t *h, sao_region_t *p_avail, int i_lcu_ p_avail->b_down_left = p_avail->b_down && p_avail->b_left; p_avail->b_right_down = p_avail->b_down && p_avail->b_right; - /* 滤波区域的调整 */ + /* 婊ゆ尝鍖哄煙鐨勮皟鏁 */ if (!p_avail->b_right) { width += SAO_SHIFT_PIX_NUM; width_c += SAO_SHIFT_PIX_NUM; @@ -1098,16 +1464,46 @@ void sao_copy_lcu(xavs2_t *h, xavs2_frame_t *frm_dst, xavs2_frame_t *frm_src, in int lcu_height; int i_first_lcu_y_for_filter = h->param->b_cross_slice_loop_filter ? 0 : h->slices[h->i_slice_index]->i_first_lcu_y; int start_y_shift = (lcu_y != i_first_lcu_y_for_filter) ? 
SAO_SHIFT_PIX_NUM : 0; - pel_t *p_src; - pel_t *p_dst; - pel_t *p_src2, *p_dst2; + if (h->param->input_sample_bit_depth == 8) { + pel8_t *p_src; + pel8_t *p_dst; + pel8_t *p_src2, *p_dst2; + + /* luma component */ + start_y -= start_y_shift; + lcu_height = end_y - start_y; + p_src = frm_src->planes8[0] + start_y * i_src + start_x; + p_dst = frm_dst->planes8[0] + start_y * i_dst + start_x; + g_funcs.plane_copy8(h, p_dst, i_dst, p_src, i_src, lcu_width, lcu_height); + + /* chroma component */ + start_y = lcu_y << (h->i_lcu_level - CHROMA_V_SHIFT); + start_y -= start_y_shift; + end_y >>= CHROMA_V_SHIFT; + start_x >>= CHROMA_V_SHIFT; + end_x >>= CHROMA_V_SHIFT; + + lcu_width = end_x - start_x; + lcu_height = end_y - start_y; + i_src = frm_src->i_stride[1]; + i_dst = frm_dst->i_stride[1]; + p_src = frm_src->planes8[1] + start_y * i_src + start_x; + p_src2 = frm_src->planes8[2] + start_y * i_src + start_x; + p_dst = frm_dst->planes8[1] + start_y * i_dst + start_x; + p_dst2 = frm_dst->planes8[2] + start_y * i_dst + start_x; + g_funcs.plane_copy8(h, p_dst, i_dst, p_src, i_src, lcu_width, lcu_height); + g_funcs.plane_copy8(h, p_dst2, i_dst, p_src2, i_src, lcu_width, lcu_height); + } else { + pel10_t *p_src; + pel10_t *p_dst; + pel10_t *p_src2, *p_dst2; /* luma component */ start_y -= start_y_shift; lcu_height = end_y - start_y; - p_src = frm_src->planes[0] + start_y * i_src + start_x; - p_dst = frm_dst->planes[0] + start_y * i_dst + start_x; - g_funcs.plane_copy(p_dst, i_dst, p_src, i_src, lcu_width, lcu_height); + p_src = frm_src->planes10[0] + start_y * i_src + start_x; + p_dst = frm_dst->planes10[0] + start_y * i_dst + start_x; + g_funcs.plane_copy10(h, p_dst, i_dst, p_src, i_src, lcu_width, lcu_height); /* chroma component */ start_y = lcu_y << (h->i_lcu_level - CHROMA_V_SHIFT); @@ -1120,12 +1516,13 @@ void sao_copy_lcu(xavs2_t *h, xavs2_frame_t *frm_dst, xavs2_frame_t *frm_src, in lcu_height = end_y - start_y; i_src = frm_src->i_stride[1]; i_dst = frm_dst->i_stride[1]; 
- p_src = frm_src->planes[1] + start_y * i_src + start_x; - p_src2 = frm_src->planes[2] + start_y * i_src + start_x; - p_dst = frm_dst->planes[1] + start_y * i_dst + start_x; - p_dst2 = frm_dst->planes[2] + start_y * i_dst + start_x; - g_funcs.plane_copy(p_dst, i_dst, p_src, i_src, lcu_width, lcu_height); - g_funcs.plane_copy(p_dst2, i_dst, p_src2, i_src, lcu_width, lcu_height); + p_src = frm_src->planes10[1] + start_y * i_src + start_x; + p_src2 = frm_src->planes10[2] + start_y * i_src + start_x; + p_dst = frm_dst->planes10[1] + start_y * i_dst + start_x; + p_dst2 = frm_dst->planes10[2] + start_y * i_dst + start_x; + g_funcs.plane_copy10(h, p_dst, i_dst, p_src, i_src, lcu_width, lcu_height); + g_funcs.plane_copy10(h, p_dst2, i_dst, p_src2, i_src, lcu_width, lcu_height); + } } /* --------------------------------------------------------------------------- @@ -1144,7 +1541,7 @@ void sao_get_lcu_param_after_deblock(xavs2_t *h, aec_t *p_aec, int i_lcu_x, int for (type = 0; type < 5; type++) { if (!h->param->b_fast_sao || tab_sao_check_mode_fast[compIdx][type]) { if (((!IS_ALG_ENABLE(OPT_FAST_SAO)) || (!(!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B)))) { - gf_sao_stat[type](h->img_sao, h->fenc, &h->sao_stat_datas[i_lcu_xy][compIdx][type], ®ion, compIdx); + gf_sao_stat[type](h, h->img_sao, h->fenc, &h->sao_stat_datas[i_lcu_xy][compIdx][type], ®ion, compIdx); } // SAOStatData tmp; // memset(&tmp, 0, sizeof(tmp)); @@ -1182,8 +1579,9 @@ void sao_filter_lcu(xavs2_t *h, SAOBlkParam blk_param[NUM_SAO_COMPONENTS], int l int pix_x = region.pix_x[compIdx]; int i_dst = h->fdec->i_stride[compIdx]; int i_src = h->img_sao->i_stride[compIdx]; - pel_t *dst = h->fdec->planes[compIdx] + pix_y * i_dst + pix_x; - pel_t *src = h->img_sao->planes[compIdx] + pix_y * i_src + pix_x; + if (h->param->input_sample_bit_depth == 8) { + pel8_t *dst = h->fdec->planes8[compIdx] + pix_y * i_dst + pix_x; + pel8_t *src = h->img_sao->planes8[compIdx] + pix_y * i_src + pix_x; int avail[8]; 
avail[0] = region.b_top; avail[1] = region.b_down; @@ -1193,11 +1591,26 @@ void sao_filter_lcu(xavs2_t *h, SAOBlkParam blk_param[NUM_SAO_COMPONENTS], int l avail[5] = region.b_top_right; avail[6] = region.b_down_left; avail[7] = region.b_right_down; - g_funcs.sao_block(dst, i_dst, src, i_src, + g_funcs.sao_block8(h, dst, i_dst, src, i_src, region.width[compIdx], region.height[compIdx], avail, &p_param[compIdx]); - - } + } else { + pel10_t *dst = h->fdec->planes10[compIdx] + pix_y * i_dst + pix_x; + pel10_t *src = h->img_sao->planes10[compIdx] + pix_y * i_src + pix_x; + int avail[8]; + avail[0] = region.b_top; + avail[1] = region.b_down; + avail[2] = region.b_left; + avail[3] = region.b_right; + avail[4] = region.b_top_left; + avail[5] = region.b_top_right; + avail[6] = region.b_down_left; + avail[7] = region.b_right_down; + g_funcs.sao_block10(h, dst, i_dst, src, i_src, + region.width[compIdx], region.height[compIdx], + avail, &p_param[compIdx]); + } + } } diff --git a/source/encoder/slice.c b/source/encoder/slice.c index 5ab94ce..d06da65 100644 --- a/source/encoder/slice.c +++ b/source/encoder/slice.c @@ -65,7 +65,7 @@ extern int g_bit_count; /* global bit count for trace */ /* --------------------------------------------------------------------------- - * 初始化LCU行的编码顺序 + * 鍒濆鍖朙CU琛岀殑缂栫爜椤哄簭 */ void slice_lcu_row_order_init(xavs2_t *h) { @@ -119,7 +119,7 @@ void slice_lcu_row_order_init(xavs2_t *h) p_slice = h->slices[idx_slice]; } } - } // 默认行级顺序 + } // 榛樿琛岀骇椤哄簭 } /* --------------------------------------------------------------------------- @@ -388,7 +388,11 @@ void *xavs2_lcu_row_write(void *arg) h->lcu.get_skip_mvs = g_funcs.get_skip_mv_predictors[h->i_type]; if (h->param->slice_num > 1) { - slice_init_bufer(h, slice); + if (h->param->input_sample_bit_depth == 8) { + slice_init_bufer8(h, slice); + } else { + slice_init_bufer10(h, slice); + } } /* loop over all LCUs in current lcu row ------------------------ @@ -588,7 +592,11 @@ void 
xavs2_slice_write_start(xavs2_t *h) aec_start(h, p_aec, slice->bs.p_start + PSEUDO_CODE_SIZE, slice->bs.p_end, 0); /* init slice buffers */ - slice_init_bufer(h, slice); + if (h->param->input_sample_bit_depth == 8) { + slice_init_bufer8(h, slice); + } else { + slice_init_bufer10(h, slice); + } /* prediction mode is set to -1 outside the frame, * indicating that no prediction can be made from this part */ diff --git a/source/encoder/slice.h b/source/encoder/slice.h index 5e7e2ba..af4d5ea 100644 --- a/source/encoder/slice.h +++ b/source/encoder/slice.h @@ -44,31 +44,42 @@ * =========================================================================== */ typedef struct slice_row_index_t { - int16_t lcu_y; /* 行编号 */ - int8_t slice_idx; /* 行所在的Slice索引号 */ - int8_t row_type; /* 0: Slice开始位置的行;1:普通;2: Slice结束位置的行 */ + int16_t lcu_y; /* 琛岀紪鍙 */ + int8_t slice_idx; /* 琛屾墍鍦ㄧ殑Slice绱㈠紩鍙 */ + int8_t row_type; /* 0: Slice寮濮嬩綅缃殑琛岋紱1:鏅氾紱2: Slice缁撴潫浣嶇疆鐨勮 */ } slice_row_index_t; extern slice_row_index_t g_slice_lcu_row_order[1024]; /* --------------------------------------------------------------------------- - * 初始化Slice级的buffer指针 + * 鍒濆鍖朣lice绾х殑buffer鎸囬拡 */ static ALWAYS_INLINE -void slice_init_bufer(xavs2_t *h, slice_t *slice) +void slice_init_bufer8(xavs2_t *h, slice_t *slice) { /* init slice buffers */ h->ipredmode = slice->slice_ipredmode; - h->intra_border[0] = slice->slice_intra_border[0]; - h->intra_border[1] = slice->slice_intra_border[1]; - h->intra_border[2] = slice->slice_intra_border[2]; + h->intra_border8[0] = slice->slice_intra_border8[0]; + h->intra_border8[1] = slice->slice_intra_border8[1]; + h->intra_border8[2] = slice->slice_intra_border8[2]; h->p_deblock_flag[0] = slice->slice_deblock_flag[0]; h->p_deblock_flag[1] = slice->slice_deblock_flag[1]; } +static ALWAYS_INLINE +void slice_init_bufer10(xavs2_t *h, slice_t *slice) +{ + /* init slice buffers */ + h->ipredmode = slice->slice_ipredmode; + h->intra_border10[0] = slice->slice_intra_border10[0]; + 
h->intra_border10[1] = slice->slice_intra_border10[1]; + h->intra_border10[2] = slice->slice_intra_border10[2]; + h->p_deblock_flag[0] = slice->slice_deblock_flag[0]; + h->p_deblock_flag[1] = slice->slice_deblock_flag[1]; +} /* --------------------------------------------------------------------------- - * 等待一行LCU编码完指定数量的LCU + * 绛夊緟涓琛孡CU缂栫爜瀹屾寚瀹氭暟閲忕殑LCU */ static ALWAYS_INLINE void wait_lcu_row_coded(row_info_t *last_row, int wait_lcu_coded) @@ -84,7 +95,7 @@ void wait_lcu_row_coded(row_info_t *last_row, int wait_lcu_coded) /* --------------------------------------------------------------------------- - * 查询一行LCU是否已编码完毕 + * 鏌ヨ涓琛孡CU鏄惁宸茬紪鐮佸畬姣 */ static ALWAYS_INLINE int is_lcu_row_finished(xavs2_t *h, xavs2_frame_t *frm, int lcu_row) @@ -93,7 +104,7 @@ int is_lcu_row_finished(xavs2_t *h, xavs2_frame_t *frm, int lcu_row) } /* --------------------------------------------------------------------------- - * 查询一行LCU是否已编码完毕 + * 鏌ヨ涓琛孡CU鏄惁宸茬紪鐮佸畬姣 */ static ALWAYS_INLINE void set_lcu_row_finished(xavs2_t *h, xavs2_frame_t *frm, int lcu_row) @@ -114,8 +125,8 @@ void xavs2e_release_row_task(row_info_t *row) xavs2_handler_t *h_mgr = h->h_top; int b_slice_boundary_done = FALSE; - /* 如果此时Slice边界的相邻行已处理完,则直接进行插值,不需要加锁 - * 否则,需要加锁后进行处理,避免出现问题 */ + /* 濡傛灉姝ゆ椂Slice杈圭晫鐨勭浉閭昏宸插鐞嗗畬锛屽垯鐩存帴杩涜鎻掑硷紝涓嶉渶瑕佸姞閿 + * 鍚﹀垯锛岄渶瑕佸姞閿佸悗杩涜澶勭悊锛岄伩鍏嶅嚭鐜伴棶棰 */ if (h->param->b_cross_slice_loop_filter == FALSE) { if (row->b_top_slice_border && row->row > 0) { if (is_lcu_row_finished(h, fdec, row->row - 1)) { @@ -131,7 +142,7 @@ void xavs2e_release_row_task(row_info_t *row) } } } else { - /* TODO: 多Slice并行时,对Slice边界的处理 */ + /* TODO: 澶歋lice骞惰鏃讹紝瀵筍lice杈圭晫鐨勫鐞 */ if (h->param->slice_num > 1) { xavs2_log(NULL, XAVS2_LOG_ERROR, "CrossSliceLoopFilter not supported now!\n"); assert(0); @@ -156,7 +167,7 @@ void xavs2e_release_row_task(row_info_t *row) } } } else { - /* TODO: 多Slice并行时,对Slice边界的处理 */ + /* TODO: 澶歋lice骞惰鏃讹紝瀵筍lice杈圭晫鐨勫鐞 */ } set_lcu_row_finished(h, fdec, row->row); 
xavs2_thread_mutex_unlock(&fdec->mutex); /* unlock */ @@ -229,7 +240,7 @@ xavs2_t *xavs2e_alloc_row_task(xavs2_t *h) memcpy(&h_row_coder->row_vars_1, &h->row_vars_1, (uint8_t *)&h->row_vars_2 - (uint8_t *)&h->row_vars_1); /* make the state of the aec engine same as the one when the slice starts */ - /* 这里h->aec的位置不同导致性能不一样,但是在LCU行编码时重新做了同步保证了一致性 */ + /* 杩欓噷h->aec鐨勪綅缃笉鍚屽鑷存ц兘涓嶄竴鏍凤紝浣嗘槸鍦↙CU琛岀紪鐮佹椂閲嶆柊鍋氫簡鍚屾淇濊瘉浜嗕竴鑷存 */ aec_copy_aec_state(&h_row_coder->aec, &h->aec); /* unlock */ xavs2_thread_mutex_unlock(&h_mgr->mutex); diff --git a/source/encoder/tdrdo.c b/source/encoder/tdrdo.c index 0acf2f0..68b4b1b 100644 --- a/source/encoder/tdrdo.c +++ b/source/encoder/tdrdo.c @@ -54,7 +54,8 @@ typedef struct Frame { uint32_t FrameWidth; uint32_t FrameHeight; uint32_t nStrideY; - pel_t *Y_base; + pel10_t *Y_base10; + pel8_t *Y_base; } Frame; typedef struct BlockDistortion { @@ -163,15 +164,37 @@ static DL *CreatDistortionList(DL *NewDL, uint32_t totalframenumber, uint32_t wi /* --------------------------------------------------------------------------- */ -static double CalculateBlockMSE(Frame *FA, Frame *FB, Block *A, Block *B) +static double CalculateBlockMSE8(Frame *FA, Frame *FB, Block *A, Block *B) { uint16_t x, y; int e, blockpixel = A->BlockHeight * A->BlockWidth; - pel_t *YA, *YB; double dSSE = 0; + pel8_t *YA, *YB; YA = FA->Y_base + A->OriginY * FA->nStrideY + A->OriginX; YB = FB->Y_base + B->OriginY * FB->nStrideY + B->OriginX; + + for (y = 0; y < A->BlockHeight; y++) { + for (x = 0; x < A->BlockWidth; x++) { + e = YA[x] - YB[x]; + dSSE += e * e; + } + YA = YA + FA->nStrideY; + YB = YB + FB->nStrideY; + } + return dSSE / blockpixel; +} + +static double CalculateBlockMSE10(Frame *FA, Frame *FB, Block *A, Block *B) +{ + uint16_t x, y; + int e, blockpixel = A->BlockHeight * A->BlockWidth; + double dSSE = 0; + + pel10_t *YA, *YB; + YA = FA->Y_base10 + A->OriginY * FA->nStrideY + A->OriginX; + YB = FB->Y_base10 + B->OriginY * FB->nStrideY + B->OriginX; + for (y = 0; y < 
A->BlockHeight; y++) { for (x = 0; x < A->BlockWidth; x++) { e = YA[x] - YB[x]; @@ -185,7 +208,7 @@ static double CalculateBlockMSE(Frame *FA, Frame *FB, Block *A, Block *B) /* --------------------------------------------------------------------------- */ -static void MotionDistortion(FD *currentFD, Frame *FA, Frame *FB, uint32_t searchrange) +static void MotionDistortion(xavs2_t *h, FD *currentFD, Frame *FA, Frame *FB, uint32_t searchrange) { static int dlx[9] = {0, -2, -1, 0, 1, 2, 1, 0, -1}; static int dly[9] = {0, 0, -1, -2, -1, 0, 1, 2, 1}; @@ -267,13 +290,23 @@ static void MotionDistortion(FD *currentFD, Frame *FA, Frame *FB, uint32_t searc if (x >= left && x <= right && y >= top && y <= bottom) { pBB->OriginX = x; pBB->OriginY = y; - currentMSE = CalculateBlockMSE(FA, FB, pBA, pBB); + if (h->param->input_sample_bit_depth == 8) { + currentMSE = CalculateBlockMSE8(FA, FB, pBA, pBB); if (currentMSE < candidateMSE) { candidateMSE = currentMSE; currentBD->MSE = currentMSE; nextcx = x; nextcy = y; } + } else { + currentMSE = CalculateBlockMSE10(FA, FB, pBA, pBB); + if (currentMSE < candidateMSE) { + candidateMSE = currentMSE; + currentBD->MSE = currentMSE; + nextcx = x; + nextcy = y; + } + } } } if (cy == nextcy && cx == nextcx) { @@ -621,20 +654,37 @@ void tdrdo_frame_start(xavs2_t *h) } td_rdo->pRealFD->BlockDistortionArray = (BD *)xavs2_calloc(td_rdo->pRealFD->TotalNumOfBlocks, sizeof(BD)); if (td_rdo->GlobeFrameNumber % td_rdo->StepLength == 0) { + if (h->param->input_sample_bit_depth == 8) { if (h->fenc->i_frame == 0) { - td_rdo->porgF.Y_base = h->fenc->planes[IMG_Y]; + td_rdo->porgF.Y_base = h->fenc->planes8[IMG_Y]; td_rdo->porgF.nStrideY = h->fenc->i_stride[IMG_Y]; - td_rdo->ppreF.Y_base = h->img_luma_pre->planes[IMG_Y]; + td_rdo->ppreF.Y_base = h->img_luma_pre->planes8[IMG_Y]; td_rdo->ppreF.nStrideY = h->img_luma_pre->i_stride[IMG_Y]; xavs2_frame_copy_planes(h, h->img_luma_pre, h->fenc); } else if ((int)h->fenc->i_frame < h->param->num_frames) { 
td_rdo->pOMCPFD = &td_rdo->OMCPDList.FrameDistortionArray[td_rdo->GlobeFrameNumber - 1]; td_rdo->pOMCPFD->BlockDistortionArray = (BD *)xavs2_calloc(td_rdo->pOMCPFD->TotalNumOfBlocks, sizeof(BD)); - td_rdo->porgF.Y_base = h->fenc->planes[IMG_Y]; + td_rdo->porgF.Y_base = h->fenc->planes8[IMG_Y]; td_rdo->porgF.nStrideY = h->fenc->i_stride[IMG_Y]; - MotionDistortion(td_rdo->pOMCPFD, &td_rdo->ppreF, &td_rdo->porgF, SEARCHRANGE); + MotionDistortion(h, td_rdo->pOMCPFD, &td_rdo->ppreF, &td_rdo->porgF, SEARCHRANGE); xavs2_frame_copy_planes(h, h->img_luma_pre, h->fenc); } + } else { + if (h->fenc->i_frame == 0) { + td_rdo->porgF.Y_base10 = h->fenc->planes10[IMG_Y]; + td_rdo->porgF.nStrideY = h->fenc->i_stride[IMG_Y]; + td_rdo->ppreF.Y_base10 = h->img_luma_pre->planes10[IMG_Y]; + td_rdo->ppreF.nStrideY = h->img_luma_pre->i_stride[IMG_Y]; + xavs2_frame_copy_planes(h, h->img_luma_pre, h->fenc); + } else if ((int)h->fenc->i_frame < h->param->num_frames) { + td_rdo->pOMCPFD = &td_rdo->OMCPDList.FrameDistortionArray[td_rdo->GlobeFrameNumber - 1]; + td_rdo->pOMCPFD->BlockDistortionArray = (BD *)xavs2_calloc(td_rdo->pOMCPFD->TotalNumOfBlocks, sizeof(BD)); + td_rdo->porgF.Y_base10 = h->fenc->planes10[IMG_Y]; + td_rdo->porgF.nStrideY = h->fenc->i_stride[IMG_Y]; + MotionDistortion(h, td_rdo->pOMCPFD, &td_rdo->ppreF, &td_rdo->porgF, SEARCHRANGE); + xavs2_frame_copy_planes(h, h->img_luma_pre, h->fenc); + } + } td_rdo->pOMCPFD = NULL; } @@ -653,10 +703,14 @@ void tdrdo_frame_done(xavs2_t *h) assert(td_rdo != NULL); if ((h->fenc->i_frame % td_rdo->StepLength == 0 && !h->param->num_bframes) || h->param->num_bframes) { - td_rdo->precF.Y_base = h->fdec->planes[IMG_Y]; + if (h->param->input_sample_bit_depth == 8) { + td_rdo->precF.Y_base = h->fdec->planes8[IMG_Y]; + } else { + td_rdo->precF.Y_base10 = h->fdec->planes10[IMG_Y]; + } //td_rdo->precF.nStrideY = h->fdec->i_stride[IMG_Y];// fdec->stride[0] , bitrate rise ? 
td_rdo->precF.nStrideY = h->img_luma_pre->i_stride[IMG_Y]; //to check: fdec->stride[0] ? by lutao - MotionDistortion(td_rdo->pRealFD, &td_rdo->porgF, &td_rdo->precF, 0); + MotionDistortion(h, td_rdo->pRealFD, &td_rdo->porgF, &td_rdo->precF, 0); } td_rdo->pRealFD->FrameNumber = h->fenc->i_frame; td_rdo->globenumber++; @@ -706,7 +760,7 @@ void tdrdo_lcu_adjust_lambda(xavs2_t *h, rdcost_t *new_lambda) // Just for LDP if (h->i_type != SLICE_TYPE_I && h->param->num_bframes == 0) { AdjustLcuQPLambdaLDP(h, td_rdo->pOMCPFD, h->lcu.i_scu_xy, h->i_width_in_mincu, new_lambda); - td_rdo->CurMBQP = XAVS2_CLIP3F(MIN_QP, MAX_QP, td_rdo->CurMBQP); + td_rdo->CurMBQP = XAVS2_CLIP3F(MIN_QP, MAX_QP + (h->param->sample_bit_depth - 8) * 8, td_rdo->CurMBQP); } } diff --git a/source/encoder/wrapper.h b/source/encoder/wrapper.h index 13c3f7e..4f75d51 100644 --- a/source/encoder/wrapper.h +++ b/source/encoder/wrapper.h @@ -48,7 +48,8 @@ */ // function type -typedef void(*vpp_ipred_t)(pel_t *p_pred, pel_t *p_top, pel_t *p_left); +typedef void(*vpp_ipred8_t)(pel8_t *p_pred, pel8_t *p_top, pel8_t *p_left); +typedef void(*vpp_ipred10_t)(pel10_t *p_pred, pel10_t *p_top, pel10_t *p_left); /* --------------------------------------------------------------------------- * lookahead_t @@ -63,26 +64,44 @@ typedef struct lookahead_t { /* --------------------------------------------------------------------------- * low resolution of frame (luma plane) */ -typedef struct frm_lowres_t { +typedef struct frm_lowres8_t { int i_width; /* width for luma plane */ int i_lines; /* height for luma plane */ int i_stride; /* stride for luma plane */ - pel_t *filtered; /* half-size copy of input frame (luma only) */ -} frm_lowres_t; + pel8_t *filtered8; /* half-size copy of input frame (luma only) */ +} frm_lowres8_t; + +typedef struct frm_lowres10_t { + int i_width; /* width for luma plane */ + int i_lines; /* height for luma plane */ + int i_stride; /* stride for luma plane */ + pel10_t *filtered10; /* half-size 
copy of input frame (luma only) */ +} frm_lowres10_t; /* --------------------------------------------------------------------------- * video pre-processing motion estimation */ -typedef struct vpp_me_t { +typedef struct vpp8_me_t { + int mv_min[2]; /* full pel MV range for motion search (min) */ + int mv_max[2]; /* full pel MV range for motion search (max) */ + mv_t bmv; /* [OUT] best motion vector */ + mv_t pmv; /* pred motion vector for the current block */ + uint16_t *mvbits; /* used for getting the mv bits */ + pixel8_cmp_t sad8_8x8; /* function handle for cal sad of 8x8 block */ + pixel8_cmp_x3_t sad8_8x8_x3; /* function handle for cal sad of 8x8 block (X3) */ + pixel8_cmp_x4_t sad8_8x8_x4; /* function handle for cal sad of 8x8 block (X4) */ +} vpp8_me_t; + +typedef struct vpp10_me_t { int mv_min[2]; /* full pel MV range for motion search (min) */ int mv_max[2]; /* full pel MV range for motion search (max) */ mv_t bmv; /* [OUT] best motion vector */ mv_t pmv; /* pred motion vector for the current block */ uint16_t *mvbits; /* used for getting the mv bits */ - pixel_cmp_t sad_8x8; /* function handle for cal sad of 8x8 block */ - pixel_cmp_x3_t sad_8x8_x3; /* function handle for cal sad of 8x8 block (X3) */ - pixel_cmp_x4_t sad_8x8_x4; /* function handle for cal sad of 8x8 block (X4) */ -} vpp_me_t; + pixel10_cmp_t sad10_8x8; /* function handle for cal sad of 8x8 block */ + pixel10_cmp_x3_t sad10_8x8_x3; /* function handle for cal sad of 8x8 block (X3) */ + pixel10_cmp_x4_t sad10_8x8_x4; /* function handle for cal sad of 8x8 block (X4) */ +} vpp10_me_t; /* --------------------------------------------------------------------------- * frame buffer manager diff --git a/source/encoder/xavs2.c b/source/encoder/xavs2.c index b2e6d54..631b215 100644 --- a/source/encoder/xavs2.c +++ b/source/encoder/xavs2.c @@ -161,7 +161,7 @@ xavs2_param_t *xavs2_encoder_opt_alloc(void) param->enable_alf = TRUE; param->alf_LowLatencyEncoding = FALSE; param->enable_pmvr = TRUE; - 
param->b_cross_slice_loop_filter = FALSE; // 影响帧级并行编解码的速度,默认禁用 + param->b_cross_slice_loop_filter = FALSE; // affects the speed of frame-level parallel encoding/decoding; disabled by default param->enable_dmh = TRUE; param->b_fast_2lelvel_tu = FALSE; @@ -280,7 +280,7 @@ void *xavs2_encoder_create(xavs2_param_t *param) /* check parameters */ if (encoder_check_parameters(param) < 0) { xavs2_log(NULL, XAVS2_LOG_ERROR, "error encoder parameters\n"); - goto fail; + goto fail8; } size_ratecontrol = xavs2_rc_get_buffer_size(param); /* rate control */ @@ -294,7 +294,7 @@ void *xavs2_encoder_create(xavs2_param_t *param) CACHE_LINE_SIZE * (XAVS2_INPUT_NUM + 4); /* alloc memory for the encoder wrapper */ - CHECKED_MALLOC(mem_ptr, uint8_t *, mem_size); + CHECKED_MALLOC8(mem_ptr, uint8_t *, mem_size); /* M0: assign the wrapper */ h_mgr = (xavs2_handler_t *)mem_ptr; @@ -333,12 +333,12 @@ void *xavs2_encoder_create(xavs2_param_t *param) #endif if (xavs2_thread_mutex_init(&h_mgr->mutex, NULL)) { - goto fail; + goto fail8; } for (i = 0; i < SIG_COUNT; i++) { if (xavs2_thread_cond_init(&h_mgr->cond[i], NULL)) { - goto fail; + goto fail8; } } @@ -359,7 +359,7 @@ void *xavs2_encoder_create(xavs2_param_t *param) /* create the thread pool */ if (xavs2_threadpool_init(&h_mgr->threadpool_rdo, thread_num, NULL, NULL)) { xavs2_log(h_mgr, XAVS2_LOG_ERROR, "Error init thread pool RDO. 
%d", thread_num); - goto fail; + goto fail8; } h_mgr->num_pool_threads = thread_num; } @@ -374,7 +374,7 @@ void *xavs2_encoder_create(xavs2_param_t *param) if (xl_init(&h_mgr->list_frames_free) != 0 || xl_init(&h_mgr->list_frames_output) != 0 || xl_init(&h_mgr->list_frames_ready) != 0) { - goto fail; + goto fail8; } /* init rate-control buffer */ @@ -385,7 +385,7 @@ void *xavs2_encoder_create(xavs2_param_t *param) if (xavs2_rc_init(h_mgr->rate_control, param) < 0) { xavs2_log(h_mgr, XAVS2_LOG_ERROR, "create rate control fail\n"); - goto fail; + goto fail8; } @@ -397,20 +397,20 @@ void *xavs2_encoder_create(xavs2_param_t *param) if (tdrdo_init(h_mgr->td_rdo, param) != 0) { xavs2_log(h_mgr, XAVS2_LOG_ERROR, "init td-rdo fail\n"); - goto fail; + goto fail8; } } /* create an encoder handler */ h_mgr->p_coder = encoder_open(param, h_mgr); if (h_mgr->p_coder == NULL) { - goto fail; + goto fail8; } /* create encoder handlers for multi-thread */ if (h_mgr->i_frm_threads > 1 || h_mgr->i_row_threads > 1) { if (encoder_contexts_init(h_mgr->p_coder, h_mgr) < 0) { - goto fail; + goto fail8; } } @@ -422,7 +422,7 @@ void *xavs2_encoder_create(xavs2_param_t *param) if (frm) { xl_append(&h_mgr->list_frames_free, frm); } else { - goto fail; + goto fail8; } } @@ -433,7 +433,7 @@ void *xavs2_encoder_create(xavs2_param_t *param) /* memory check */ if ((uintptr_t)(h_mgr) + mem_size < (uintptr_t)mem_ptr) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to create input frame buffer.\n"); - goto fail; + goto fail8; } /* init lookahead in the encoder wrapper */ @@ -448,12 +448,12 @@ void *xavs2_encoder_create(xavs2_param_t *param) /* create wrapper thread */ if (xavs2_create_thread(&h_mgr->thread_wrapper, proc_wrapper_thread, h_mgr)) { xavs2_log(h_mgr, XAVS2_LOG_ERROR, "create encoding thread\n"); - goto fail; + goto fail8; } return h_mgr; -fail: +fail8: if (mem_ptr && h_mgr) { xavs2_encoder_destroy(h_mgr); } @@ -532,7 +532,7 @@ int xavs2_encoder_get_buffer(void *coder, xavs2_picture_t *pic) /* 
set properties */ pic->img.in_sample_size = param->input_sample_bit_depth == 8 ? 1 : 2; - pic->img.enc_sample_size = sizeof(pel_t); + pic->img.enc_sample_size = param->input_sample_bit_depth == 8 ? sizeof(pel8_t) : sizeof(pel10_t); pic->img.i_width[0] = param->org_width; pic->img.i_width[1] = param->org_width >> 1; pic->img.i_width[2] = param->org_width >> 1; @@ -541,12 +541,12 @@ int xavs2_encoder_get_buffer(void *coder, xavs2_picture_t *pic) pic->img.i_lines[2] = param->org_height >> (param->chroma_format <= CHROMA_420 ? 1 : 0); pic->img.i_csp = XAVS2_CSP_I420; pic->img.i_plane = frame->i_plane; - pic->img.i_stride[0] = frame->i_stride[0] * sizeof(pel_t); - pic->img.i_stride[1] = frame->i_stride[1] * sizeof(pel_t); - pic->img.i_stride[2] = frame->i_stride[2] * sizeof(pel_t); - pic->img.img_planes[0] = (uint8_t *)frame->planes[0]; - pic->img.img_planes[1] = (uint8_t *)frame->planes[1]; - pic->img.img_planes[2] = (uint8_t *)frame->planes[2]; + pic->img.i_stride[0] = param->input_sample_bit_depth == 8 ? frame->i_stride[0] * sizeof(pel8_t) : frame->i_stride[0] * sizeof(pel10_t) ; /* byte stride = element stride * sample size; each plane uses its own index in both branches */ + pic->img.i_stride[1] = param->input_sample_bit_depth == 8 ? frame->i_stride[1] * sizeof(pel8_t) : frame->i_stride[1] * sizeof(pel10_t) ; + pic->img.i_stride[2] = param->input_sample_bit_depth == 8 ? frame->i_stride[2] * sizeof(pel8_t) : frame->i_stride[2] * sizeof(pel10_t) ; + pic->img.img_planes[0] = param->input_sample_bit_depth == 8 ? (uint8_t *)frame->planes8[0] : (uint8_t *)frame->planes10[0]; + pic->img.img_planes[1] = param->input_sample_bit_depth == 8 ? (uint8_t *)frame->planes8[1] : (uint8_t *)frame->planes10[1]; /* chroma planes have their own buffers; do not alias the luma plane */ + pic->img.img_planes[2] = param->input_sample_bit_depth == 8 ? 
(uint8_t *)frame->planes8[2] : (uint8_t *)frame->planes10[2]; pic->priv = frame; /* keep trace of this frame */ return 0; diff --git a/source/encoder/xavs2_api.c b/source/encoder/xavs2_api.c index 43fa6b2..7bd28f7 100644 --- a/source/encoder/xavs2_api.c +++ b/source/encoder/xavs2_api.c @@ -57,7 +57,24 @@ static xavs2_api_t api_default = { XVERSION_STR, VER_MAJOR * 10 + VER_MINOR, - BIT_DEPTH, + 8, + xavs2_encoder_opt_help, + xavs2_encoder_opt_alloc, + xavs2_encoder_opt_set, + xavs2_encoder_opt_set2, + xavs2_encoder_opt_get, + xavs2_encoder_opt_destroy, + xavs2_encoder_get_buffer, + xavs2_encoder_create, + xavs2_encoder_destroy, + xavs2_encoder_encode, + xavs2_encoder_packet_unref, +}; + +static xavs2_api_t api_default10 = { + XVERSION_STR, + VER_MAJOR * 10 + VER_MINOR, + 10, xavs2_encoder_opt_help, xavs2_encoder_opt_alloc, xavs2_encoder_opt_set, @@ -78,7 +95,7 @@ typedef const xavs2_api_t *(*xavs2_api_get_t)(int bit_depth); static const xavs2_api_t *xavs2_load_new_module(const char *dll_path, const char *methofd_name, int bit_depth) { - /* TODO: 在使用错误的库时, 会出现递归调用此函数最终导致崩溃 */ + /* TODO: when the wrong library is loaded, this function ends up being called recursively and eventually crashes */ #if _WIN32 HMODULE h = LoadLibraryA(dll_path); if (h) { @@ -116,10 +133,12 @@ xavs2_api_get(int bit_depth) const char* method_name = "xavs2_api_get"; switch (bit_depth) { - case BIT_DEPTH: + case 8: return &api_default; + case 10: + return &api_default10; default: - sprintf(s_lib_name, "libxavs2-%d-%dbit.%s", VER_MAJOR * 10 + VER_MINOR, bit_depth, ext_dyn_lib); + sprintf(s_lib_name, "libxavs2-%d-%dbit", VER_MAJOR * 10 + VER_MINOR, bit_depth); return xavs2_load_new_module(s_lib_name, method_name, bit_depth); } } diff --git a/source/encoder/xlist.c b/source/encoder/xlist.c index d26c1dc..5912f6b 100644 --- a/source/encoder/xlist.c +++ b/source/encoder/xlist.c @@ -37,11 +37,6 @@ #include "common.h" #include "xlist.h" -#if !defined(_MSC_VER) -#include -#include -#endif - /** * 
=========================================================================== * xlist diff --git a/source/encoder/yuv_writer.c b/source/encoder/yuv_writer.c index 1096d8b..9d47f5b 100644 --- a/source/encoder/yuv_writer.c +++ b/source/encoder/yuv_writer.c @@ -46,20 +46,36 @@ void dump_yuv_out(xavs2_t *h, FILE *fp, xavs2_frame_t *frame, int img_w, int img int j; if (fp != NULL) { - UNUSED_PARAMETER(h); + //UNUSED_PARAMETER(h); + if (h->param->input_sample_bit_depth == 8) { for (j = 0; j < img_h; j++) { - fwrite(frame->planes[0] + j * frame->i_stride[0], img_w, 1, fp); + fwrite(frame->planes8[0] + j * frame->i_stride[0], img_w, 1, fp); } if (frame->i_plane == 3) { for (j = 0; j < (img_h >> 1); j++) { - fwrite(frame->planes[1] + j * frame->i_stride[1], img_w >> 1, 1, fp); + fwrite(frame->planes8[1] + j * frame->i_stride[1], img_w >> 1, 1, fp); } for (j = 0; j < (img_h >> 1); j++) { - fwrite(frame->planes[2] + j * frame->i_stride[2], img_w >> 1, 1, fp); + fwrite(frame->planes8[2] + j * frame->i_stride[2], img_w >> 1, 1, fp); } } + } else { + for (j = 0; j < img_h; j++) { + fwrite(frame->planes10[0] + j * frame->i_stride[0], img_w * sizeof(pel10_t), 1, fp); /* fwrite counts bytes: a row of img_w pel10_t samples is img_w * sizeof(pel10_t) bytes; assumes planes10[] is pel10_t * -- TODO confirm against xavs2_frame_t */ + } + + if (frame->i_plane == 3) { + for (j = 0; j < (img_h >> 1); j++) { + fwrite(frame->planes10[1] + j * frame->i_stride[1], (img_w >> 1) * sizeof(pel10_t), 1, fp); + } + + for (j = 0; j < (img_h >> 1); j++) { + fwrite(frame->planes10[2] + j * frame->i_stride[2], (img_w >> 1) * sizeof(pel10_t), 1, fp); + } + } + } } }