x264 - x264_mb_encode_i16x16
2014-06-19 20:54
369 查看
/* All encoding functions must output the correct CBP and NNZ values.
* The entropy coding functions will check CBP first, then NNZ, before
* actually reading the DCT coefficients. NNZ still must be correct even
* if CBP is zero because of the use of NNZ values for context selection.
* "NNZ" need only be 0 or 1 rather than the exact coefficient count because
* that is only needed in CAVLC, and will be calculated by CAVLC's residual
* coding and stored as necessary. */
/* This means that decimation can be done merely by adjusting the CBP and NNZ
* rather than memsetting the coefficients. */
/* Encode one plane of an intra 16x16 macroblock.
 *
 * Runs the 16x16 intra prediction into the fdec plane, then either takes the
 * lossless path (coefficients written directly, early return) or transforms,
 * quantizes and dequantizes the residual, records CBP/NNZ, and adds the
 * reconstructed residual back into fdec.
 *
 * h    - encoder context; reads h->mb state and writes h->dct.luma4x4,
 *        h->dct.luma16x16_dc, h->mb.i_cbp_luma, h->mb.cache.non_zero_count
 *        and the fdec plane.
 * p    - plane index; selects the fenc/fdec plane and the 16*p / LUMA_DC+p
 *        offsets into the coefficient and NNZ arrays.
 * i_qp - index used to select the quant/dequant tables. */
static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
{
pixel *p_src = h->mb.pic.p_fenc[p];
pixel *p_dst = h->mb.pic.p_fdec[p];
// Presumably these expand to aligned arrays equivalent to:
//   dctcoef dct4x4[16][16]  -- 16 coefficients for each of the 16 4x4 blocks
//   dctcoef dct_dc4x4[16]   -- the 16 DC coefficients, one per 4x4 block
ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
int nz, block_cbp = 0;
// With decimation disabled the score starts at 9, so none of the
// "decimate_score < 6" tests below can ever be true.
int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
// Quant matrix category: CQM_4IY for plane 0, CQM_4IC for any other plane.
int i_quant_cat = p ? CQM_4IC : CQM_4IY;
// Author's debugging note: for the first MB of the test run, i_mode was 6.
int i_mode = h->mb.i_intra16x16_pred_mode;
// Write the 16x16 prediction into the fdec plane.
if( h->mb.b_lossless )
x264_predict_lossless_16x16( h, p, i_mode );
else // author's note: h->predict_16x16[6] is x264_predict_16x16_dc_128_c (TODO confirm)
h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
if( h->mb.b_lossless )
{
// Lossless path: no quantization or reconstruction step here.
for( int i = 0; i < 16; i++ )
{
int oe = block_idx_xy_fenc[i];
int od = block_idx_xy_fdec[i];
// Presumably writes the block's residual (fenc - fdec) in scan order to
// h->dct.luma4x4 and its DC term to dct_dc4x4; the return value is used as NNZ.
nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
block_cbp |= nz;
}
// Assuming nz is 0 or 1, this sets all four luma CBP bits when any block is nonzero.
h->mb.i_cbp_luma |= block_cbp * 0xf;
h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
return;
}
// Reset the NNZ entries of this plane's 16 4x4 blocks; blocks that turn out
// to have nonzero coefficients are set to 1 again below.
CLEAR_16x16_NNZ( p );
// Author's note: h->dctf.sub16x16_dct is sub16x16_dct in the C build.
// Fills dct4x4 from the source and predicted planes.
h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
if( h->mb.b_noise_reduction )
for( int idx = 0; idx < 16; idx++ )
h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
// Split the DC term out of each 4x4 block: copy coefficient 0 into dct_dc4x4
// (at the position given by block_idx_xy_1d) and zero it in the block, so the
// blocks below carry only AC coefficients.
for( int idx = 0; idx < 16; idx++ )
{
dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0];
dct4x4[idx][0] = 0;
}
if( h->mb.b_trellis )
{
// Trellis quantization, one 4x4 block at a time; a nonzero return marks
// the block as having coefficients.
for( int idx = 0; idx < 16; idx++ )
if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) )
{
block_cbp = 0xf;
// Store the quantized levels in scan order, then dequantize in place for reconstruction.
h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
// Stop accumulating once the threshold of 6 has been reached.
if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
}
}
else
{
// Plain quantization: process the 16 blocks in four groups of four.
for( int i8x8 = 0; i8x8 < 4; i8x8++ )
{
// Author's note: h->quantf.quant_4x4x4 is quant_4x4x4 in the C build.
// Quantizes four consecutive 4x4 blocks; nz is presumably a bitmask
// with one bit per block that still has nonzero coefficients.
nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
// Only do further work if at least one block of the group is nonzero.
if( nz )
{
block_cbp = 0xf;
// Presumably iterates idx over the set bits of nz, offset by i8x8*4,
// i.e. over the nonzero 4x4 blocks of this group.
FOREACH_BIT( idx, i8x8*4, nz )
{
// Store the quantized levels in scan order in h->dct.luma4x4.
h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
// Dequantize in place so dct4x4 can be used for reconstruction.
h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
// Stop accumulating once the threshold of 6 has been reached.
if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
// Mark this 4x4 block as nonzero in the NNZ cache.
h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
}
}
}
}
/* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
/* More useful with CAVLC, but still useful with CABAC. */
// Below the threshold: drop the AC residual by clearing NNZ and block_cbp only;
// the stored coefficients themselves are not touched.
if( decimate_score < 6 )
{
CLEAR_16x16_NNZ( p );
block_cbp = 0;
}
else
h->mb.i_cbp_luma |= block_cbp;
// Author's note: h->dctf.dct4x4dc is dct4x4dc in the C build.
// Transform the 16 DC coefficients together as one 4x4 block.
h->dctf.dct4x4dc( dct_dc4x4 );
// From here on nz refers to the DC block, not to the AC blocks above.
if( h->mb.b_trellis )
nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
else // DC quantization uses entry 0 of the quant tables, with mf halved and bias doubled
nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );
// Record the DC block's NNZ flag at this plane's DC slot of the cache.
h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
if( nz )
{
// The DC block has nonzero levels: store them in scan order in
// h->dct.luma16x16_dc.
// Author's note: h->zigzagf.scan_4x4 is zigzag_scan_4x4_frame for frame encoding.
h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
/* output samples to fdec */
// Author's note: h->dctf.idct4x4dc is idct4x4dc in the C build.
// Inverse-transform the DC block in place...
h->dctf.idct4x4dc( dct_dc4x4 );
// ...then dequantize it.
// Author's note: h->quantf.dequant_4x4_dc is dequant_4x4_dc in the C build.
h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp ); /* XXX not inversed */
if( block_cbp )
for( int i = 0; i < 16; i++ ) // put each reconstructed DC back into coefficient 0 of its 4x4 block
dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
}
/* put pixels to fdec */
if( block_cbp ) // AC present: inverse-transform all 16 blocks (DC included if set above) and add to fdec
h->dctf.add16x16_idct( p_dst, dct4x4 );
else if( nz ) // DC only: add just the DC contribution to fdec
h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}
// From the scan8 layout:
// the first 16 entries denote the 16 4x4 Y blocks,
// the second 16 entries denote the 16 4x4 U blocks,
// the third 16 entries denote the 16 4x4 V blocks,
// and the last three entries are Ydc, Udc, Vdc.
//
// p = 0, 1, 2 denotes the Y, U, V plane respectively;
// x264_scan8[16 * p] gives the cache position of the first 4x4 block
// of plane p.
//
/* Zero the non_zero_count cache entries of all sixteen 4x4 blocks of plane p.
 *
 * x264_scan8[16*(p)] is the cache position of the plane's first 4x4 block;
 * each cache row is 8 entries wide, so the four 32-bit stores below clear four
 * consecutive entries in each of the plane's four rows.
 *
 * Expects a variable `h` (the encoder context) to be in scope at the point of
 * use. Comments inside the body must be block comments followed by a
 * backslash: a `//` line without a continuation would end the macro there. */
#define CLEAR_16x16_NNZ( p ) \
do\
{\
    /* cache row 0: 4x4 blocks 0, 1, 4, 5 */\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 0*8] ) = 0;\
    /* cache row 1: 4x4 blocks 2, 3, 6, 7 */\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 1*8] ) = 0;\
    /* cache row 2: 4x4 blocks 8, 9, 12, 13 */\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 2*8] ) = 0;\
    /* cache row 3: 4x4 blocks 10, 11, 14, 15 */\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 3*8] ) = 0;\
} while(0)
/* Scan8 organization:
 *    0 1 2 3 4 5 6 7
 * 0  DY    y y y y y
 * 1        y Y Y Y Y
 * 2        y Y Y Y Y
 * 3        y Y Y Y Y
 * 4        y Y Y Y Y
 * 5  DU    u u u u u
 * 6        u U U U U
 * 7        u U U U U
 * 8        u U U U U
 * 9        u U U U U
 * 10 DV    v v v v v
 * 11       v V V V V
 * 12       v V V V V
 * 13       v V V V V
 * 14       v V V V V
 * DY/DU/DV are for luma/chroma DC.
 */
/* Indices of the DC entries that follow the 3*16 per-block entries of
 * x264_scan8: LUMA_DC is the first of them, CHROMA_DC the second. */
#define LUMA_DC 48
#define CHROMA_DC 49

/* Maps a block index to its position in an 8-entry-wide cache.
 * Entries 0-15, 16-31 and 32-47 are the 4x4 blocks of the three planes
 * (grid rows 1-4, 6-9 and 11-14, columns 4-7); the final three entries are
 * the DC slots in column 0 of rows 0, 5 and 10. */
static const uint8_t x264_scan8[16*3 + 3] =
{
#define SCAN8_POS( col, row ) ( (col) + (row)*8 )
    /* first plane */
    SCAN8_POS( 4,  1 ), SCAN8_POS( 5,  1 ), SCAN8_POS( 4,  2 ), SCAN8_POS( 5,  2 ),
    SCAN8_POS( 6,  1 ), SCAN8_POS( 7,  1 ), SCAN8_POS( 6,  2 ), SCAN8_POS( 7,  2 ),
    SCAN8_POS( 4,  3 ), SCAN8_POS( 5,  3 ), SCAN8_POS( 4,  4 ), SCAN8_POS( 5,  4 ),
    SCAN8_POS( 6,  3 ), SCAN8_POS( 7,  3 ), SCAN8_POS( 6,  4 ), SCAN8_POS( 7,  4 ),
    /* second plane */
    SCAN8_POS( 4,  6 ), SCAN8_POS( 5,  6 ), SCAN8_POS( 4,  7 ), SCAN8_POS( 5,  7 ),
    SCAN8_POS( 6,  6 ), SCAN8_POS( 7,  6 ), SCAN8_POS( 6,  7 ), SCAN8_POS( 7,  7 ),
    SCAN8_POS( 4,  8 ), SCAN8_POS( 5,  8 ), SCAN8_POS( 4,  9 ), SCAN8_POS( 5,  9 ),
    SCAN8_POS( 6,  8 ), SCAN8_POS( 7,  8 ), SCAN8_POS( 6,  9 ), SCAN8_POS( 7,  9 ),
    /* third plane */
    SCAN8_POS( 4, 11 ), SCAN8_POS( 5, 11 ), SCAN8_POS( 4, 12 ), SCAN8_POS( 5, 12 ),
    SCAN8_POS( 6, 11 ), SCAN8_POS( 7, 11 ), SCAN8_POS( 6, 12 ), SCAN8_POS( 7, 12 ),
    SCAN8_POS( 4, 13 ), SCAN8_POS( 5, 13 ), SCAN8_POS( 4, 14 ), SCAN8_POS( 5, 14 ),
    SCAN8_POS( 6, 13 ), SCAN8_POS( 7, 13 ), SCAN8_POS( 6, 14 ), SCAN8_POS( 7, 14 ),
    /* DC slots */
    SCAN8_POS( 0,  0 ), SCAN8_POS( 0,  5 ), SCAN8_POS( 0, 10 )
#undef SCAN8_POS
};
/* All encoding functions must output the correct CBP and NNZ values.
* The entropy coding functions will check CBP first, then NNZ, before
* actually reading the DCT coefficients. NNZ still must be correct even
* if CBP is zero because of the use of NNZ values for context selection.
* "NNZ" need only be 0 or 1 rather than the exact coefficient count because
* that is only needed in CAVLC, and will be calculated by CAVLC's residual
* coding and stored as necessary. */
/* This means that decimation can be done merely by adjusting the CBP and NNZ
* rather than memsetting the coefficients. */
/* Encode one plane of an intra 16x16 macroblock.
 *
 * Runs the 16x16 intra prediction into the fdec plane, then either takes the
 * lossless path (coefficients written directly, early return) or transforms,
 * quantizes and dequantizes the residual, records CBP/NNZ, and adds the
 * reconstructed residual back into fdec.
 *
 * h    - encoder context; reads h->mb state and writes h->dct.luma4x4,
 *        h->dct.luma16x16_dc, h->mb.i_cbp_luma, h->mb.cache.non_zero_count
 *        and the fdec plane.
 * p    - plane index; selects the fenc/fdec plane and the 16*p / LUMA_DC+p
 *        offsets into the coefficient and NNZ arrays.
 * i_qp - index used to select the quant/dequant tables. */
static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
{
pixel *p_src = h->mb.pic.p_fenc[p];
pixel *p_dst = h->mb.pic.p_fdec[p];
// Presumably these expand to aligned arrays equivalent to:
//   dctcoef dct4x4[16][16]  -- 16 coefficients for each of the 16 4x4 blocks
//   dctcoef dct_dc4x4[16]   -- the 16 DC coefficients, one per 4x4 block
ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
int nz, block_cbp = 0;
// With decimation disabled the score starts at 9, so none of the
// "decimate_score < 6" tests below can ever be true.
int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
// Quant matrix category: CQM_4IY for plane 0, CQM_4IC for any other plane.
int i_quant_cat = p ? CQM_4IC : CQM_4IY;
// Author's debugging note: for the first MB of the test run, i_mode was 6.
int i_mode = h->mb.i_intra16x16_pred_mode;
// Write the 16x16 prediction into the fdec plane.
if( h->mb.b_lossless )
x264_predict_lossless_16x16( h, p, i_mode );
else // author's note: h->predict_16x16[6] is x264_predict_16x16_dc_128_c (TODO confirm)
h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
if( h->mb.b_lossless )
{
// Lossless path: no quantization or reconstruction step here.
for( int i = 0; i < 16; i++ )
{
int oe = block_idx_xy_fenc[i];
int od = block_idx_xy_fdec[i];
// Presumably writes the block's residual (fenc - fdec) in scan order to
// h->dct.luma4x4 and its DC term to dct_dc4x4; the return value is used as NNZ.
nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
block_cbp |= nz;
}
// Assuming nz is 0 or 1, this sets all four luma CBP bits when any block is nonzero.
h->mb.i_cbp_luma |= block_cbp * 0xf;
h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
return;
}
// Reset the NNZ entries of this plane's 16 4x4 blocks; blocks that turn out
// to have nonzero coefficients are set to 1 again below.
CLEAR_16x16_NNZ( p );
// Author's note: h->dctf.sub16x16_dct is sub16x16_dct in the C build.
// Fills dct4x4 from the source and predicted planes.
h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
if( h->mb.b_noise_reduction )
for( int idx = 0; idx < 16; idx++ )
h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
// Split the DC term out of each 4x4 block: copy coefficient 0 into dct_dc4x4
// (at the position given by block_idx_xy_1d) and zero it in the block, so the
// blocks below carry only AC coefficients.
for( int idx = 0; idx < 16; idx++ )
{
dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0];
dct4x4[idx][0] = 0;
}
if( h->mb.b_trellis )
{
// Trellis quantization, one 4x4 block at a time; a nonzero return marks
// the block as having coefficients.
for( int idx = 0; idx < 16; idx++ )
if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) )
{
block_cbp = 0xf;
// Store the quantized levels in scan order, then dequantize in place for reconstruction.
h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
// Stop accumulating once the threshold of 6 has been reached.
if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
}
}
else
{
// Plain quantization: process the 16 blocks in four groups of four.
for( int i8x8 = 0; i8x8 < 4; i8x8++ )
{
// Author's note: h->quantf.quant_4x4x4 is quant_4x4x4 in the C build.
// Quantizes four consecutive 4x4 blocks; nz is presumably a bitmask
// with one bit per block that still has nonzero coefficients.
nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
// Only do further work if at least one block of the group is nonzero.
if( nz )
{
block_cbp = 0xf;
// Presumably iterates idx over the set bits of nz, offset by i8x8*4,
// i.e. over the nonzero 4x4 blocks of this group.
FOREACH_BIT( idx, i8x8*4, nz )
{
// Store the quantized levels in scan order in h->dct.luma4x4.
h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
// Dequantize in place so dct4x4 can be used for reconstruction.
h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
// Stop accumulating once the threshold of 6 has been reached.
if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
// Mark this 4x4 block as nonzero in the NNZ cache.
h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
}
}
}
}
/* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
/* More useful with CAVLC, but still useful with CABAC. */
// Below the threshold: drop the AC residual by clearing NNZ and block_cbp only;
// the stored coefficients themselves are not touched.
if( decimate_score < 6 )
{
CLEAR_16x16_NNZ( p );
block_cbp = 0;
}
else
h->mb.i_cbp_luma |= block_cbp;
// Author's note: h->dctf.dct4x4dc is dct4x4dc in the C build.
// Transform the 16 DC coefficients together as one 4x4 block.
h->dctf.dct4x4dc( dct_dc4x4 );
// From here on nz refers to the DC block, not to the AC blocks above.
if( h->mb.b_trellis )
nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
else // DC quantization uses entry 0 of the quant tables, with mf halved and bias doubled
nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );
// Record the DC block's NNZ flag at this plane's DC slot of the cache.
h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
if( nz )
{
// The DC block has nonzero levels: store them in scan order in
// h->dct.luma16x16_dc.
// Author's note: h->zigzagf.scan_4x4 is zigzag_scan_4x4_frame for frame encoding.
h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
/* output samples to fdec */
// Author's note: h->dctf.idct4x4dc is idct4x4dc in the C build.
// Inverse-transform the DC block in place...
h->dctf.idct4x4dc( dct_dc4x4 );
// ...then dequantize it.
// Author's note: h->quantf.dequant_4x4_dc is dequant_4x4_dc in the C build.
h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp ); /* XXX not inversed */
if( block_cbp )
for( int i = 0; i < 16; i++ ) // put each reconstructed DC back into coefficient 0 of its 4x4 block
dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
}
/* put pixels to fdec */
if( block_cbp ) // AC present: inverse-transform all 16 blocks (DC included if set above) and add to fdec
h->dctf.add16x16_idct( p_dst, dct4x4 );
else if( nz ) // DC only: add just the DC contribution to fdec
h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}
// From the scan8 layout:
// the first 16 entries denote the 16 4x4 Y blocks,
// the second 16 entries denote the 16 4x4 U blocks,
// the third 16 entries denote the 16 4x4 V blocks,
// and the last three entries are Ydc, Udc, Vdc.
//
// p = 0, 1, 2 denotes the Y, U, V plane respectively;
// x264_scan8[16 * p] gives the cache position of the first 4x4 block
// of plane p.
//
/* Zero the non_zero_count cache entries of all sixteen 4x4 blocks of plane p.
 *
 * x264_scan8[16*(p)] is the cache position of the plane's first 4x4 block;
 * each cache row is 8 entries wide, so the four 32-bit stores below clear four
 * consecutive entries in each of the plane's four rows.
 *
 * Expects a variable `h` (the encoder context) to be in scope at the point of
 * use. Comments inside the body must be block comments followed by a
 * backslash: a `//` line without a continuation would end the macro there. */
#define CLEAR_16x16_NNZ( p ) \
do\
{\
    /* cache row 0: 4x4 blocks 0, 1, 4, 5 */\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 0*8] ) = 0;\
    /* cache row 1: 4x4 blocks 2, 3, 6, 7 */\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 1*8] ) = 0;\
    /* cache row 2: 4x4 blocks 8, 9, 12, 13 */\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 2*8] ) = 0;\
    /* cache row 3: 4x4 blocks 10, 11, 14, 15 */\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 3*8] ) = 0;\
} while(0)
/* Scan8 organization:
 *    0 1 2 3 4 5 6 7
 * 0  DY    y y y y y
 * 1        y Y Y Y Y
 * 2        y Y Y Y Y
 * 3        y Y Y Y Y
 * 4        y Y Y Y Y
 * 5  DU    u u u u u
 * 6        u U U U U
 * 7        u U U U U
 * 8        u U U U U
 * 9        u U U U U
 * 10 DV    v v v v v
 * 11       v V V V V
 * 12       v V V V V
 * 13       v V V V V
 * 14       v V V V V
 * DY/DU/DV are for luma/chroma DC.
 */
#define LUMA_DC 48
#define CHROMA_DC 49
/* Maps a block index to its position in an 8-entry-wide cache.
 * Entries 0-15, 16-31 and 32-47 are the 4x4 blocks of the three planes
 * (grid rows 1-4, 6-9 and 11-14, columns 4-7); the final three entries are
 * the DC slots in column 0 of rows 0, 5 and 10. */
static const uint8_t x264_scan8[16*3 + 3] =
{
#define SCAN8_POS( col, row ) ( (col) + (row)*8 )
    /* first plane */
    SCAN8_POS( 4,  1 ), SCAN8_POS( 5,  1 ), SCAN8_POS( 4,  2 ), SCAN8_POS( 5,  2 ),
    SCAN8_POS( 6,  1 ), SCAN8_POS( 7,  1 ), SCAN8_POS( 6,  2 ), SCAN8_POS( 7,  2 ),
    SCAN8_POS( 4,  3 ), SCAN8_POS( 5,  3 ), SCAN8_POS( 4,  4 ), SCAN8_POS( 5,  4 ),
    SCAN8_POS( 6,  3 ), SCAN8_POS( 7,  3 ), SCAN8_POS( 6,  4 ), SCAN8_POS( 7,  4 ),
    /* second plane */
    SCAN8_POS( 4,  6 ), SCAN8_POS( 5,  6 ), SCAN8_POS( 4,  7 ), SCAN8_POS( 5,  7 ),
    SCAN8_POS( 6,  6 ), SCAN8_POS( 7,  6 ), SCAN8_POS( 6,  7 ), SCAN8_POS( 7,  7 ),
    SCAN8_POS( 4,  8 ), SCAN8_POS( 5,  8 ), SCAN8_POS( 4,  9 ), SCAN8_POS( 5,  9 ),
    SCAN8_POS( 6,  8 ), SCAN8_POS( 7,  8 ), SCAN8_POS( 6,  9 ), SCAN8_POS( 7,  9 ),
    /* third plane */
    SCAN8_POS( 4, 11 ), SCAN8_POS( 5, 11 ), SCAN8_POS( 4, 12 ), SCAN8_POS( 5, 12 ),
    SCAN8_POS( 6, 11 ), SCAN8_POS( 7, 11 ), SCAN8_POS( 6, 12 ), SCAN8_POS( 7, 12 ),
    SCAN8_POS( 4, 13 ), SCAN8_POS( 5, 13 ), SCAN8_POS( 4, 14 ), SCAN8_POS( 5, 14 ),
    SCAN8_POS( 6, 13 ), SCAN8_POS( 7, 13 ), SCAN8_POS( 6, 14 ), SCAN8_POS( 7, 14 ),
    /* DC slots */
    SCAN8_POS( 0,  0 ), SCAN8_POS( 0,  5 ), SCAN8_POS( 0, 10 )
#undef SCAN8_POS
};
相关文章推荐
- x264 - x264_mb_encode_i16x16
- x264里的2pass指的是什么意思? x264源代码分析2.encode()
- x264代码剖析(九):x264_encoder_encode()函数之x264_slice's'_write()函数
- int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal ) .
- x264代码剖析(五):encode()函数之x264_encoder_open()函数
- x264代码剖析(五):encode()函数之x264_encoder_open()函数
- x264 - x264.mb.i_neighbour8
- x264 encode
- x264 - x264_mb_predict_intra4x4_mode
- x264代码剖析(六):encode()函数之x264_encoder_headers()函数
- X264中的x264_encoder_encode和x264_nal_encode函数
- int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal )
- x264代码剖析(六):encode()函数之x264_encoder_headers()函数
- X264 - x264_encoder_encode
- x264_mb_predict_mv_16x16
- x264代码剖析(十三):核心算法之帧间预測函数x264_mb_analyse_inter_*()
- h264源码分析之x264_encoder_encode
- x264 - x264_mb_analyse_inter_b16x16
- x264源代码简单分析:宏块编码(Encode)部分
- x264代码剖析(七):encode()函数之x264_encoder_encode()函数