您的位置:首页 > 其它

五级流水线CPU之低功耗设计 (一) :Bypassing(旁路)

2015-03-02 17:20 465 查看
五级流水线整体图示:


某些指令执行的流水线级数:



由上面两幅图可知,流水线的第四级是Data Memory,用于数据的存入和读取。但在CPU的指令集中,只有LOAD指令和STORE指令会在第四级访问Data Memory,其他指令在这一级流水线并不进行任何操作(不使用Data Memory里的数据)。也就是说,流水线第四级对LOAD指令和STORE指令之外的指令没有实质意义,却仍然带来了不必要的功率损耗。因此,对于LOAD、STORE之外的指令,可以采用四级流水线(将WB级提前一级)来减少不必要的功率损耗。实现过程图如下:







由于某些指令需要五级流水线,而另一些指令只需四级流水线(将WB阶段提前一级),两类指令混合执行时,在实现的过程中要注意冲突问题。

为更清晰的表达利用旁路实现低功耗的原理和讨论冲突问题,先将25条指令(27条指令除去NOP和HALT指令)分为四类:

第一类:

指令:LOAD

IF

ID

EX

MEM

WB

第二类:

指令:STORE,JMPR,BZ,BNZ,BN,BNN,BC,BNC

IF

ID

EX

MEM

NOP

第三类:

指令:LDIH、ADD、ADDI、ADDC、SUB、SUBI、SUBC、OR、AND、XOR、SLL、SRL、SLA、SRA

IF

ID

EX

NOP

WB

第四类:

指令:CMP,JUMP

IF

ID

NOP

NOP

NOP

由上面的分类可知:第一类指令必须采用五级流水线;第二类指令的空操作NOP处于流水线第五级(流水线最后一级),没有办法把流水线提前一级;第四类指令的空操作NOP处于流水线第三、第四、第五级,同样没有办法把流水线提前一级;第三类指令的空操作NOP只处于流水线第四级,可以把WB阶段提前一级。但是,如果当前指令属于第三类,而上一条指令的WB阶段不是NOP,那么把当前指令的WB阶段提前一级后,两条指令会在同一个周期进入WB阶段,从而出现冲突。也就是说,把当前指令的WB阶段提前一级的前提是:上一条指令是第二类或者第四类指令(其WB阶段为NOP),不能是第一类指令。

算术类指令并没有使用到流水线第四阶段(Data Memory);同时,STORE指令并不需要WB这一级流水线,而LOAD指令要经过完整的五级流水线。信号在无用阶段中的翻转会造成额外的功率消耗,这些不必要的翻转可以通过旁路掉无用的阶段来减少。算术类指令不需要访问内存,因此由EX阶段得到的数据可以直接传给第五阶段WB。在此过程中,EX/MEM的流水线寄存器保持原值(下面的代码在旁路时不更新reg_C、mem_ir和smdr1),因此保证了这些寄存器不发生翻转。

代码如下:

// ---------------------------------------------------------------------------
// Control-FSM state encoding (one bit).
// ---------------------------------------------------------------------------
`define idle  1'b0
`define exec  1'b1

// ---------------------------------------------------------------------------
// 5-bit opcodes. The module compares them against bits [15:11] of each
// pipeline instruction register. The encodings are consecutive, 0 to 26.
// ---------------------------------------------------------------------------
// Pipeline control
`define NOP   5'd0
`define HALT  5'd1
// Data-memory access
`define LOAD  5'd2
`define STORE 5'd3
// Immediate load / add / subtract / compare
`define LDIH  5'd4
`define ADD   5'd5
`define ADDI  5'd6
`define ADDC  5'd7
`define SUB   5'd8
`define SUBI  5'd9
`define SUBC  5'd10
`define CMP   5'd11
// Bitwise logic
`define AND   5'd12
`define OR    5'd13
`define XOR   5'd14
// Shifts
`define SLL   5'd15
`define SRL   5'd16
`define SLA   5'd17
`define SRA   5'd18
// Jumps and conditional branches
`define JUMP  5'd19
`define JMPR  5'd20
`define BZ    5'd21
`define BNZ   5'd22
`define BN    5'd23
`define BNN   5'd24
`define BC    5'd25
`define BNC   5'd26

module Bypassing(
    // Clock and asynchronous, active-low reset.
    input             clk,
    input             reset,
    // The control FSM leaves idle only while enable and start are both high;
    // dropping enable sends it back to idle.
    input             enable,
    input             start,
    // Data-memory read data, and the instruction word that is latched into
    // id_ir (presumably the word stored at i_addr -- confirm against the
    // instruction memory model).
    input      [15:0] d_datain,
    input      [15:0] i_datain,
    // Instruction address; driven directly from pc below.
    output wire [7:0] i_addr,
    // Data-memory address and the program counter.
    output reg  [7:0] d_addr,
    output reg  [7:0] pc,
    // Data-memory write data and write enable.
    output reg [15:0] d_dataout,
    output reg        d_we
);

// Control FSM: current state and its combinational next value.
reg state;
reg nextstate;

// General-purpose register file: eight 16-bit registers.
reg [15:0] gr [0:7];

// Instruction word held by each pipeline stage.
reg [15:0] id_ir, ex_ir, mem_ir, wb_ir;

// ALU operands captured in ID, and the ALU result.
reg [15:0] reg_A, reg_B, ALUo;

// ALU result carried into MEM (reg_C) and the value handed to WB (reg_C1).
reg [15:0] reg_C, reg_C1;

// STORE data captured in ID (smdr) and carried into MEM (smdr1).
reg [15:0] smdr, smdr1;

// dw: write request raised while a STORE is in EX.
// zf / nf / cf: zero, negative and carry flags.
reg dw;
reg zf, nf, cf;

// The instruction memory is addressed by the program counter.
assign i_addr = pc;

//************* CPU control (two-process FSM) *************//

// State register: asynchronous active-low reset to idle, otherwise follow
// the combinational next-state value on every rising clock edge.
always @(posedge clk or negedge reset)
begin
    if (!reset)
        state <= `idle;
    else
        state <= nextstate;
end

// Next-state logic.
//  - idle -> exec when enable and start are both high.
//  - exec -> idle when enable drops or a HALT reaches the WB stage.
// Blocking assignments are used because this is combinational logic, and
// nextstate gets a default so no latch is inferred when state is unknown.
always @(*)
begin
    nextstate = `idle;

    case (state)
        `idle :
            begin
                if ((enable == 1'b1) && (start == 1'b1))
                    nextstate = `exec;
                else
                    nextstate = `idle;
            end

        `exec :
            begin
                if ((enable == 1'b0) || (wb_ir[15:11] == `HALT))
                    nextstate = `idle;
                else
                    nextstate = `exec;
            end

        default :
            nextstate = `idle;
    endcase
end

//************* IF : Instruction fetch *************//

// Opcode of the instruction word currently offered on i_datain.
wire [4:0] if_op = i_datain[15:11];

// A JMPR, or a conditional branch whose flag condition holds, is sitting in
// mem_ir: the PC is redirected to the target held in reg_C.
wire if_branch_taken =
       (mem_ir[15:11] == `JMPR)
    || ((mem_ir[15:11] == `BZ)  && (zf == 1'b1))
    || ((mem_ir[15:11] == `BNZ) && (zf == 1'b0))
    || ((mem_ir[15:11] == `BN)  && (nf == 1'b1))
    || ((mem_ir[15:11] == `BNN) && (nf == 1'b0))
    || ((mem_ir[15:11] == `BC)  && (cf == 1'b1))
    || ((mem_ir[15:11] == `BNC) && (cf == 1'b0));

// Incoming instructions that read the register named by bits [10:8] (r1).
wire if_reads_r1 =
       (if_op == `STORE) || (if_op == `LDIH) || (if_op == `ADDI)
    || (if_op == `SUBI)  || (if_op == `JMPR) || (if_op == `BZ)
    || (if_op == `BNZ)   || (if_op == `BN)   || (if_op == `BNN)
    || (if_op == `BC)    || (if_op == `BNC);

// Incoming instructions that read the register named by bits [6:4] (r2).
// LOAD is included: the decode stage uses gr[r2] as its base operand, so a
// LOAD that follows the LOAD producing its base needs the bubble as well.
wire if_reads_r2 =
       (if_op == `LOAD) || (if_op == `STORE)
    || (if_op == `ADD)  || (if_op == `ADDC) || (if_op == `SUB)
    || (if_op == `SUBC) || (if_op == `CMP)  || (if_op == `AND)
    || (if_op == `OR)   || (if_op == `XOR)  || (if_op == `SLL)
    || (if_op == `SRL)  || (if_op == `SLA)  || (if_op == `SRA);

// Incoming instructions that read the register named by bits [2:0] (r3).
wire if_reads_r3 =
       (if_op == `ADD) || (if_op == `ADDC) || (if_op == `SUB)
    || (if_op == `SUBC) || (if_op == `CMP) || (if_op == `AND)
    || (if_op == `OR)  || (if_op == `XOR);

// Load-use hazard: the instruction in ID is a LOAD and the incoming
// instruction reads the register that LOAD will write (id_ir[10:8]).
wire if_load_use_hazard =
       (id_ir[15:11] == `LOAD)
    && (   (if_reads_r1 && (id_ir[10:8] == i_datain[10:8]))
        || (if_reads_r2 && (id_ir[10:8] == i_datain[6:4]))
        || (if_reads_r3 && (id_ir[10:8] == i_datain[2:0])) );

always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        id_ir <= 16'b0000_0000_0000_0000;
        pc    <= 8'b0000_0000;
    end
    else if (state == `exec)
    begin
        if (if_branch_taken)
        begin
            // Redirect to the branch target computed by the ALU.
            // NOTE(review): the word on i_datain is still latched here and
            // the instructions already in ID/EX keep advancing; confirm
            // whether wrong-path instructions are meant to be squashed.
            pc    <= reg_C[7:0];
            id_ir <= i_datain;
        end
        else if (if_op == `JUMP)
        begin
            // Unconditional JUMP: target is the low byte of the instruction.
            pc    <= i_datain[7:0];
            id_ir <= i_datain;
        end
        else if (if_load_use_hazard)
        begin
            // Stall one cycle: pc holds its value and a NOP bubble enters
            // ID. An explicit NOP is used rather than an all-x word, which a
            // synthesiser could turn into any instruction.
            id_ir <= {`NOP, 11'b000_0000_0000};
        end
        else
        begin
            // Normal sequential fetch. This also covers an instruction that
            // follows a LOAD without reading its destination; previously
            // that case assigned nothing, which froze pc and id_ir forever.
            pc    <= pc + 8'b1;
            id_ir <= i_datain;
        end
    end
    // state == idle: pc and id_ir hold their values.
end

//************* ID : Instruction Decode *************//

// A JMPR, or a conditional branch whose flag condition holds, is in mem_ir.
// Register operands read in this cycle are then replaced by zero.
// The JMPR test uses the opcode field [15:11]; two of the original copies
// compared the 3-bit r1 field [10:8] with the opcode and so never matched.
wire id_branch_taken =
       (mem_ir[15:11] == `JMPR)
    || ((mem_ir[15:11] == `BZ)  && (zf == 1'b1))
    || ((mem_ir[15:11] == `BNZ) && (zf == 1'b0))
    || ((mem_ir[15:11] == `BN)  && (nf == 1'b1))
    || ((mem_ir[15:11] == `BNN) && (nf == 1'b0))
    || ((mem_ir[15:11] == `BC)  && (cf == 1'b1))
    || ((mem_ir[15:11] == `BNC) && (cf == 1'b0));

// Returns 1 when opcode `op` produces a value for gr[r1], i.e. when an
// instruction with that opcode may act as a forwarding source. STORE, JMPR,
// the conditional branches, CMP, JUMP, NOP and HALT carry an r1 field or an
// ALU result but write no register, so they must not be forwarded from.
function id_writes_gr;
    input [4:0] op;
    begin
        case (op)
            `LOAD, `LDIH,
            `ADD, `ADDI, `ADDC,
            `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR,
            `SLL, `SRL, `SLA, `SRA : id_writes_gr = 1'b1;
            default                : id_writes_gr = 1'b0;
        endcase
    end
endfunction

// Returns the value of register `r` as the instruction in ID must see it,
// forwarding from the youngest matching producer:
//   1. EX  : ALUo (not for LOAD, whose data is not available yet)
//   2. MEM : d_datain for a LOAD, reg_C otherwise
//   3. WB  : reg_C1
//   4. the register file
// NOTE(review): when the bypass path leaves mem_ir unchanged, step 2 can
// match an instruction that has already left MEM; confirm this is intended.
function [15:0] id_operand;
    input [2:0] r;
    begin
        if ((r == ex_ir[10:8]) && id_writes_gr(ex_ir[15:11])
                && (ex_ir[15:11] != `LOAD))
            id_operand = ALUo;
        else if ((r == mem_ir[10:8]) && id_writes_gr(mem_ir[15:11]))
        begin
            if (mem_ir[15:11] == `LOAD)
                id_operand = d_datain;
            else
                id_operand = reg_C;
        end
        else if ((r == wb_ir[10:8]) && id_writes_gr(wb_ir[15:11]))
            id_operand = reg_C1;
        else
            id_operand = gr[r];
    end
endfunction

always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        ex_ir <= 16'b0000_0000_0000_0000;
        reg_A <= 16'b0000_0000_0000_0000;
        reg_B <= 16'b0000_0000_0000_0000;
        smdr  <= 16'b0000_0000_0000_0000;
    end
    else if (state == `exec)
    begin
        // NOTE(review): the instruction always advances to EX, even when a
        // taken branch zeroes its operands below.
        ex_ir <= id_ir;

        //------ reg_A : first ALU operand ------//
        case (id_ir[15:11])
            // Operand A is r1. SUBI is listed because it computes
            // r1 - immediate, like ADDI; it was missing before and read the
            // upper immediate bits as a register number.
            `LDIH, `ADDI, `SUBI,
            `BZ, `BNZ, `BN, `BNN, `BC, `BNC, `JMPR :
                reg_A <= id_branch_taken ? 16'b0 : id_operand(id_ir[10:8]);

            // Operand A is r2 for every other opcode.
            default :
                reg_A <= id_branch_taken ? 16'b0 : id_operand(id_ir[6:4]);
        endcase

        //------ reg_B : second ALU operand ------//
        case (id_ir[15:11])
            // 4-bit immediate val3, zero-extended.
            `LOAD, `STORE, `SLL, `SRL, `SLA, `SRA :
                reg_B <= {12'b0000_0000_0000, id_ir[3:0]};

            // LDIH : r1 <- r1 + {val2, val3, 0000_0000}
            `LDIH :
                reg_B <= {id_ir[7:0], 8'b0000_0000};

            // 8-bit immediate {val2, val3}, zero-extended.
            `ADDI, `SUBI,
            `BZ, `BNZ, `BN, `BNN, `BC, `BNC, `JMPR :
                reg_B <= {8'b0000_0000, id_ir[7:0]};

            // Operand B is r3.
            default :
                reg_B <= id_branch_taken ? 16'b0 : id_operand(id_ir[2:0]);
        endcase

        //------ smdr : data to be stored (r1 of a STORE) ------//
        // For any other opcode smdr keeps its value.
        if (id_ir[15:11] == `STORE)
            smdr <= id_branch_taken ? 16'b0 : id_operand(id_ir[10:8]);
    end
end

//************* EX *************//
// Moves the instruction in EX and its ALU result into the EX/MEM registers,
// updates the zero/negative flags on CMP, and raises dw for a STORE.
// NOTE(review): `bypassing` is declared further down in this file; tools
// that require declaration before use will reject that ordering.
always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        mem_ir <= 0;
        zf     <= 0;
        nf     <= 0;
        reg_C  <= 0;
        dw     <= 0;
        smdr1  <= 0;
    end
    else if (state == `exec)
    begin
        //*************** Bypassing ***************//
        // While the bypass flag is set the EX/MEM registers keep their
        // previous contents, so they do not toggle.
        if (!bypassing)
        begin
            reg_C  <= ALUo;
            mem_ir <= ex_ir;
            smdr1  <= smdr;
        end

        // CMP is the only instruction that updates zf and nf.
        if (ex_ir[15:11] == `CMP)
        begin
            if (ALUo == 16'b0000_0000_0000_0000)
                zf <= 1'b1;
            else
                zf <= 1'b0;

            // nf follows the sign bit on every CMP, including a zero result.
            // It used to keep the previous CMP's value when the result was
            // zero, so a following BN/BNN could see a stale flag.
            if (ALUo[15] == 1'b1)
                nf <= 1'b1;
            else
                nf <= 1'b0;
        end

        // A STORE in EX requests a data-memory write.
        if (ex_ir[15:11] == `STORE)
            dw <= 1'b1;
        else
            dw <= 1'b0;
    end
end

//************* ALU *************//
// Combinational ALU. The 17-bit result is split into a carry/borrow-out bit
// (alu_cf) and the 16-bit value ALUo.
//
// Changes from the earlier version of this block:
//  - Blocking assignments and explicit defaults are used, so no latch is
//    inferred for ALUo on NOP / HALT / JUMP.
//  - cf is now a clocked flag. It used to be assigned inside this
//    combinational block while also being read by ADDC/SUBC, which formed a
//    combinational feedback loop.
//  - The signed helper register for SRA is replaced by $signed().
reg alu_cf;   // carry/borrow out of the operation currently in EX

always @(*)
begin
    alu_cf = 1'b0;
    ALUo   = 16'b0000_0000_0000_0000;

    case (ex_ir[15:11])
        // Address, target and plain add computations.
        `LOAD, `STORE, `LDIH, `ADD, `ADDI,
        `JMPR, `BZ, `BNZ, `BN, `BNN, `BC, `BNC :
                 {alu_cf, ALUo} = reg_A + reg_B;

        `ADDC  : {alu_cf, ALUo} = reg_A + reg_B + cf;

        // CMP only sets flags; its difference is computed like SUB.
        `SUB, `SUBI, `CMP :
                 {alu_cf, ALUo} = reg_A - reg_B;

        `SUBC  : {alu_cf, ALUo} = reg_A - reg_B - cf;

        `AND   : {alu_cf, ALUo} = reg_A & reg_B;
        `OR    : {alu_cf, ALUo} = reg_A | reg_B;
        `XOR   : {alu_cf, ALUo} = reg_A ^ reg_B;

        // Logical shifts fill with zeros.
        `SLL   : {alu_cf, ALUo} = reg_A << reg_B;
        `SRL   : {alu_cf, ALUo} = reg_A >> reg_B;
        // Arithmetic left shift is the same as a logical left shift.
        `SLA   : {alu_cf, ALUo} = reg_A <<< reg_B;
        // Arithmetic right shift replicates the sign bit.
        `SRA   : {alu_cf, ALUo} = $signed(reg_A) >>> reg_B;

        default : ;   // NOP, HALT, JUMP: keep the zero defaults
    endcase
end

// Carry flag register.
// cf is updated by the opcodes for which this block previously assigned cf,
// except the address/target computations of LOAD, STORE, JMPR and the
// conditional branches. A BC/BNC therefore tests the carry left by an
// earlier instruction, not the carry of its own target computation.
// NOTE(review): confirm this opcode set against the ISA specification.
always @(posedge clk or negedge reset)
begin
    if (!reset)
        cf <= 1'b0;
    else if (state == `exec)
    begin
        case (ex_ir[15:11])
            `LDIH,
            `ADD, `ADDI, `ADDC,
            `SUB, `SUBI, `SUBC, `CMP,
            `AND, `OR, `XOR,
            `SLL, `SRL, `SLA, `SRA : cf <= alu_cf;
            default : ;   // cf holds its value
        endcase
    end
end

//***************** Decide whether to bypass the MEM stage ***************//
// bypassing is set when the instruction in EX only needs EX and WB (LDIH,
// add/subtract, logic and shift opcodes) and the instruction in MEM is not
// a LOAD. A LOAD needs all five stages, so advancing the next write-back
// would collide with it.
//
// NOTE(review): the flag is registered, so the EX and MEM stages see it one
// cycle after the ex_ir value it was computed from. Confirm that this
// one-cycle offset is intended.
// NOTE(review): this declaration follows the first use of `bypassing` in the
// EX stage; tools that require declaration before use need it moved up.
reg bypassing;

always @(posedge clk or negedge reset)
begin
    if (!reset)
        bypassing <= 1'b0;
    else
    begin
        case (ex_ir[15:11])
            `LDIH,
            `ADD, `ADDI, `ADDC,
            `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR,
            `SLL, `SRL, `SLA, `SRA :
                begin
                    // Compare the opcode field only. The whole 16-bit
                    // mem_ir was compared with the 5-bit opcode before, so
                    // a real LOAD (non-zero register fields) never matched.
                    if (mem_ir[15:11] == `LOAD)
                        bypassing <= 1'b0;
                    else
                        bypassing <= 1'b1;
                end

            default :
                bypassing <= 1'b0;
        endcase
    end
end

//************* MEM *************//
// Drives the data-memory interface from the EX/MEM registers and selects
// the instruction and value handed to the WB stage.
// NOTE(review): d_addr is registered on the same clock edge that samples
// d_datain for a LOAD, so the sampled data belongs to the previous address.
// Confirm against the data-memory timing model.
always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        wb_ir     <= 0;
        reg_C1    <= 0;
        // d_addr is reset with the other outputs; it was previously left
        // undefined until the first cycle in the exec state.
        d_addr    <= 0;
        d_dataout <= 0;
        d_we      <= 0;
    end
    else if (state == `exec)
    begin
        // Data-memory interface.
        d_addr    <= reg_C[7:0];
        d_dataout <= smdr1;
        d_we      <= dw;

        //*************** Bypassing **************//
        // With the bypass flag set, the instruction in EX goes straight to
        // WB; otherwise the instruction in MEM moves on.
        if (bypassing)
            wb_ir <= ex_ir;
        else
            wb_ir <= mem_ir;

        // Value for write-back: loaded data has priority, then the bypassed
        // ALU result, then the ALU result held in reg_C.
        if (mem_ir[15:11] == `LOAD)
            reg_C1 <= d_datain;
        else if (bypassing)
            reg_C1 <= ALUo;
        else
            reg_C1 <= reg_C;
    end
end

//************* WB *************//
// Writes reg_C1 into gr[r1] for instructions that produce a register result.
integer wb_i;   // loop index used only to clear the register file on reset

always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        // Clear all eight registers. The reset branch used to end with
        // gr[wb_ir[10:8]] <= gr[wb_ir[10:8]], a later nonblocking assignment
        // that overrode the clear for whichever register wb_ir selected.
        for (wb_i = 0; wb_i < 8; wb_i = wb_i + 1)
            gr[wb_i] <= 16'b0000_0000_0000_0000;
    end
    else if (state == `exec)
    begin
        case (wb_ir[15:11])
            // CMP is not listed: it only sets flags, and the forwarding
            // logic in ID never treats it as a register producer. It used to
            // be written back and overwrote gr[wb_ir[10:8]].
            `LOAD, `LDIH,
            `ADD, `ADDI, `ADDC,
            `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR,
            `SLL, `SRL, `SLA, `SRA :
                gr[wb_ir[10:8]] <= reg_C1;

            default : ;   // no register write
        endcase
    end
end

endmodule

优化前后的XPOWER测试结果比较:

Basic design:



With clock gating:

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: