您的位置:首页 > 职场人生

HTK3.4程序员手册(2.3)--特征参数提取HParm.c

2014-04-17 23:28 543 查看
HTK3.4程序员手册(2.3)--特征参数提取HParm.c

by 云龙

HTK book中提到的参数有11种:

"LPC", "LPREFC", "LPCEPSTRA",   "LPDELCEP", "IREFC", "MFCC", "FBANK", "MELSPEC","DISCRETE", "PLP","ANON"



但HTK3.4中是否都支持呢?

请看ConvertFrame()函数中的以下代码:

   switch(btgt){

   case LPC:

      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);

      v = cf->a; bsize = cf->lpcOrder;

      break;

   case LPREFC:

      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);

      v = cf->k; bsize = cf->lpcOrder;

      break;     

   case LPCEPSTRA:

      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);

      LPC2Cepstrum(cf->a,cf->c);

      if (cf->cepLifter > 0)

         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);

      v = cf->c; bsize = cf->numCepCoef;

      break;

   case MELSPEC:

   case FBANK:

      Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);

      v = cf->fbank; bsize = cf->numChans;

      break;

   case MFCC:

      Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);

      FBank2MFCC(cf->fbank, cf->c, cf->numCepCoef);

      if (cf->cepLifter > 0)

         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);

      v = cf->c; bsize = cf->numCepCoef;

      break;

   case PLP:

      Wave2FBank(cf->s, cf->fbank, rawE ? NULL : &te, cf->fbInfo);

      FBank2ASpec(cf->fbank, cf->as, cf->eql, cf->compressFact, cf->fbInfo);

      ASpec2LPCep(cf->as, cf->ac, cf->lp, cf->c, cf->cm);

      if (cf->cepLifter > 0)

         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);

      v = cf->c;

      bsize = cf->numCepCoef;

      break;

   default:

      HError(6321,"ConvertFrame: target %s is not a parameterised form",

             ParmKind2Str(cf->tgtPK,buf));

   }

可以看出HTK3.4支持7种参数:LPC,LPREFC,LPCEPSTRA,MELSPEC,FBANK,MFCC,PLP。

参数转换顺序可以参照HTK book Fig.5.9:



IOConfigRec数据结构存放着特征提取过程中用到的很多参数:
typedef struct {
   /* ------- Overrideable parameters ------- */
   ParmKind srcPK;            /* Source ParmKind */

   FileFormat srcFF;          /* Source File format */

   HTime srcSampRate;         /* Source Sample Rate */

   Boolean zMeanSrc;          /* Zero Mean the Source */
   ParmKind tgtPK;            /* Target ParmKind */

   FileFormat tgtFF;          /* Target File format */ 

...... 

}IOConfigRec;
 
ValidCodeParms()函数检查analysis.conf的参数是否合理。
/* ValidCodeParms: check to ensure reasonable wave->parm code params */
static void ValidCodeParms(IOConfig cf)
 
/* SetUpForCoding: set style, sizes and  working storage */
static void SetUpForCoding(MemHeap *x, IOConfig cf, int frSize)
 
ValidConversion()函数检查源格式到目标格式的转换是否可行。
/* EXPORT->ValidConversion: checks that src -> tgt conversion is possible */
Boolean ValidConversion (ParmKind src, ParmKind tgt)
 
TotalComps()函数返回特征参数的维度。
/* TotalComps: return the total number of components in a parameter vector
   with nStatic components and ParmKind pk */
static int TotalComps(int nStatic, ParmKind pk)
 
在OpenAsChannel()函数中,计算特征参数所需的内存空间:
dBytes = cf->nCols * pbuf->main.maxRows * sizeof(float);
      = 39 * 243 * 4 = 37908
 
在调用提取特征参数的FillBufFromChannel()函数之前,先调用了StartBuffer()函数,那么StartBuffer()函数有什么作用呢?
/* EXPORT->StartBuffer: start audio and fill the buffer */
void StartBuffer(ParmBuf pbuf)
{
……
   if (pbuf->status == PB_INIT) {
      if (pbuf->cf->useSilDet) ChangeState(pbuf,PB_WAITING);

      else ChangeState(pbuf,PB_FILLING);

   }
……
}
/* PBStatus: life-cycle states of a parameter buffer.
   The numeric order is significant: OpenAsChannel keeps filling
   while status < PB_STOPPED, so every state listed before
   PB_STOPPED counts as "not yet stopped". */
typedef enum {
   PB_INIT     = 0,  /* initialised, holds no data yet */
   PB_WAITING  = 1,  /* waiting for speech to begin */
   PB_STOPPING = 2,  /* waiting for silence */
   PB_FILLING  = 3,  /* currently being filled */
   PB_STOPPED  = 4,  /* stopped, but still holds unread data */
   PB_CLEARED  = 5   /* stopped and fully emptied */
} PBStatus;
PBStatus status;    /* status of this buffer */
通过ChangeState()函数可以看出,ParmBuf pbuf有一个状态标志PBStatus status。StartBuffer()函数的作用就是在缓冲区处于PB_INIT状态时切换该标志:若启用了静音检测(useSilDet),则改为PB_WAITING,表示等待语音开始;否则改为PB_FILLING,表示正在填装数据。
 
FillBufFromChannel()函数提取wav数据的特征,FillBufFromChannel()函数在OpenAsChannel()函数中被调用。
/* OpenAsChannel: open and create an audio input buffer */
static ReturnStatus OpenAsChannel(ParmBuf pbuf, int maxObs,

                                  char *fname, FileFormat ff,
                                  TriState silMeasure)
{
……
   if (maxObs==0) {
      /* maxObs==0 indicates want a table straight away */
      StartBuffer(pbuf);
      while(pbuf->status<PB_STOPPED)
         FillBufFromChannel(pbuf,MAX_INT);
   }
……
}
 
FillBufFromChannel()函数中调用了函数FramesInChannel()和函数GetFrameFromChannel()。
 
从FramesInChannel()函数调用处的注释看,好像是提取参数,但仔细一看,它返回的其实是当前无需阻塞即可读取的帧数(即wav语音的窗数)。
   /* Fill Buffer with converted static coef vectors */
   newRows=FramesInChannel(pbuf,pbuf->chType);
 
/* Return number of frames that can be read without blocking */
/*       -1 == Done, no more to read. */
/*        0 == May block on reading first frame. */
/*        N == Can read N frames immediately without blocking. */
/*  INT_MAX == Will not block. */
static int FramesInChannel(ParmBuf pbuf,int chType){
……
}
再来看看GetFrameFromChannel()函数,
/* Get a single frame from particular channel */
/*  Return value indicates number of frames read okay */
static int GetFrameFromChannel(ParmBuf pbuf,int chType,void *vp)
 
 
FillBufFromChannel中逐窗提取语音特征参数的for循环:

   /* Read the necessary frames */
   for (i=0; i<newRows; i++) {
      /* But have final check on read just in case */
      if (pbuf->dShort) {
         if (GetFrameFromChannel(pbuf,pbuf->chType,sp1)!=1) {
            pbuf->chClear=TRUE;
            break;
         }
         sp1 += cf->nCols;

      }
      else {
         if (GetFrameFromChannel(pbuf,pbuf->chType,fp1)!=1) {       //提取特征参数
            pbuf->chClear=TRUE;
            break;
         }
         fp1 += cf->nCols;

      }
      pbuf->inRow++;pbuf->main.nRows++;
   }
 

 
// fp1指向存放特征参数的buffer;MFCC参数是float型。
// pbuf->main.data的声明:void *data;       /* parameterised data for this block */
fp1 = (float*) pbuf->main.data + pbuf->main.nRows*cf->nCols;
 
static void FillBufFromChannel(ParmBuf pbuf,int minRows)
{
……
   for (i=0; i<newRows; i++) { //此处newRows就是wav语音文件的窗数(Frame Number)
      /* But have final check on read just in case */
      if (pbuf->dShort) {
         if (GetFrameFromChannel(pbuf,pbuf->chType,sp1)!=1) {
            pbuf->chClear=TRUE;
            break;
         }
         sp1 += cf->nCols;

      }
      else {
         if (GetFrameFromChannel(pbuf,pbuf->chType,fp1)!=1) { //调用这里
            pbuf->chClear=TRUE;
            break;
         }
         fp1 += cf->nCols;

      }
      pbuf->inRow++;pbuf->main.nRows++;
   }
……
}
 
GetFrameFromChannel()函数调用ConvertFrame()来将语音转换为特征参数。
/* Get a single frame from particular channel */
/*  Return value indicates number of frames read okay */
static int GetFrameFromChannel(ParmBuf pbuf,int chType,void *vp)
{
……
      /* Then convert it to a frame */
      if (ConvertFrame(cf, (float *) vp) != cf->nCvrt)
……
}
 
ConvertFrame()函数是最直接的参数提取函数。原始wav语音数据存放于cf->s中,而cf->s是Vector类型,即float*类型。
那么,单声道16位的wav语音是如何存放为float型的呢?
HTK在GetWave()函数中把每个short int型的样本点直接转换为float型。
/* EXPORT->GetWave: Get next nFrames from w and store in buf */
void GetWave(Wave w, int nFrames, float *buf)
{

..... 
*buf++ = w->data[w->frIdx+k];          
//将short int转换为float,存放于cf->s中

...... 
}
 
/* ConvertFrame: convert frame in cf->s and store in pbuf, return total
   parameters stored in pbuf.

   cf   - IO configuration; supplies the source frame (cf->s), the
          target kind (cf->tgtPK), the processing options and the
          working vectors used below.  cf->s is modified in place and
          cf->curPK gains HASZEROC / HASENERGY when those are written.
   pbuf - output array of floats; written sequentially from pbuf[0].

   Returns the number of floats written (p - pbuf).

   Processing order is: dither, zero-mean, raw energy, pre-emphasis,
   Hamming window, then the conversion selected by the base kind.
   The raw energy is therefore measured before pre-emphasis and
   windowing. */
static int ConvertFrame(IOConfig cf, float *pbuf)
{
   /* Base parameter kind with the qualifier bits masked off */
   ParmKind btgt = cf->tgtPK&BASEMASK;
   float re,rawte=0.0,te,*p, cepScale = 1.0;
   int i,bsize=0;
   Vector v=NULL;
   char buf[50];
   Boolean rawE;
  

   p = pbuf;
   rawE = cf->rawEnergy;
   /* In v1 compatibility mode raw energy is disabled for kinds below MFCC */
   if (btgt<MFCC && cf->v1Compat)
      rawE = FALSE;
 
   /* Optional dither: add (RandomValue()*2.0 - 1.0)*addDither to every
      sample.  Vectors are indexed 1..VectorSize here. */
   if (cf->addDither!=0.0)
      for (i=1; i<=VectorSize(cf->s); i++)
         cf->s[i] += (RandomValue()*2.0 - 1.0)*cf->addDither;
 
   if (cf->zMeanSrc && !cf->v1Compat)
      ZeroMeanFrame(cf->s);
   /* Raw energy: sum of squared samples, taken before pre-emphasis
      and windowing */
   if ((cf->tgtPK&HASENERGY) && rawE){
      rawte = 0.0;
      for (i=1; i<=VectorSize(cf->s); i++)
         rawte += cf->s[i] * cf->s[i];
   }
   if (cf->preEmph>0.0)

      PreEmphasise(cf->s,cf->preEmph);
   if (cf->useHam) Ham(cf->s);
   /* Each case leaves v pointing at the result vector and bsize set to
      the number of coefficients to copy out.  te is only assigned via
      the &te arguments below (or from rawte further down).
      NOTE(review): re is passed to Wave2LPC but never read in this
      function. */
   switch(btgt){
   case LPC:

      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
      v = cf->a; bsize = cf->lpcOrder;
      break;
   case LPREFC:

      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
      v = cf->k; bsize = cf->lpcOrder;
      break;     

   case LPCEPSTRA:
      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
      LPC2Cepstrum(cf->a,cf->c);
      if (cf->cepLifter > 0)
         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
      v = cf->c; bsize = cf->numCepCoef;
      break;
   case MELSPEC:
   case FBANK:

      /* With raw energy selected, NULL is passed so te is not filled in here */
      Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);
      v = cf->fbank; bsize = cf->numChans;
      break;
   case MFCC:

      Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);
      FBank2MFCC(cf->fbank, cf->c, cf->numCepCoef);
      if (cf->cepLifter > 0)
         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
      v = cf->c; bsize = cf->numCepCoef;
      break;
   case PLP:
      Wave2FBank(cf->s, cf->fbank, rawE ? NULL : &te, cf->fbInfo);
      FBank2ASpec(cf->fbank, cf->as, cf->eql, cf->compressFact, cf->fbInfo);
      ASpec2LPCep(cf->as, cf->ac, cf->lp, cf->c, cf->cm);
      if (cf->cepLifter > 0)
         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
      v = cf->c;

      bsize = cf->numCepCoef;
      break;
   default:
      /* Any other base kind cannot be computed from a waveform frame.
         NOTE(review): presumably HError does not return; if it did,
         v would still be NULL below - confirm. */
      HError(6321,"ConvertFrame: target %s is not a parameterised form",
             ParmKind2Str(cf->tgtPK,buf));
   }
 
   /* Only PLP and MFCC outputs are scaled by cf->cepScale, and not in
      v1 compatibility mode; every other kind keeps cepScale == 1.0 */
   if (btgt == PLP || btgt == MFCC)
      cepScale = (cf->v1Compat) ? 1.0 : cf->cepScale;
   /* Copy the static coefficients v[1..bsize] to the output */
   for (i=1; i<=bsize; i++)

      *p++ = v[i] * cepScale;
 
   /* Append C0 when requested by the target kind */
   if (cf->tgtPK&HASZEROC){
      if (btgt == MFCC) {
         *p = FBank2C0(cf->fbank) * cepScale;
         if (cf->v1Compat) *p *= cf->eScale;
         ++p;
      }
      else      /* For PLP include gain as C0 */
         /* Reads one element past the bsize coefficients copied above.
            NOTE(review): this branch is taken for every non-MFCC kind,
            not only PLP - presumably callers never request C0 for the
            other kinds; confirm. */
         *p++ = v[bsize+1] * cepScale;  

      cf->curPK|=HASZEROC ;
   }
   /* Append log energy when requested; values below MINLARG are
      written as LZERO instead of taking the log */
   if (cf->tgtPK&HASENERGY) {
      if (rawE) te = rawte;
      *p++ = (te<MINLARG) ? LZERO : log(te); 

      cf->curPK|=HASENERGY;
   }
   return p - pbuf;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  语音识别 HTK3.4 Wav