您的位置:首页 > 其它

五级流水线CPU之低功耗设计 (二) :Clock Gating(门控)

2015-03-02 17:33 459 查看
五级流水线整体图示:



Verilog代码综合得到的总框图:



某些指令执行的流水线级数:



如上图所见,很多指令在某些stage并不需要更新流水线寄存器,保持原数据即可(相当于NOP)。而刷新寄存器数据无疑会产生动态功耗;之前的CPU基本设计中,无论数据是否改变,每个周期都会刷新寄存器数据,造成了不必要的动态功耗。基于此动机,我们可以尝试消除这些本不需要刷新、但实际仍在刷新数据的情况,实现方法就是时钟门控(clock gating)。门控的基本原理是关闭芯片上暂时用不到的功能模块及其时钟,从而达到节省电流消耗的目的。一般而言,门控对象应选择那些不经常使用或不经常变化的寄存器。观察可以发现,除LOAD和STORE指令以外,其余指令都不需要刷新流水线第四级(DATA_MEMORY)的三个寄存器d_addr、d_we、d_dataout,因此可以对它们进行门控以减少功耗。

代码实现如下:

// ---------------------------------------------------------------
// Control FSM state encodings (1 bit)
// ---------------------------------------------------------------
`define idle  1'b0
`define exec  1'b1

// ---------------------------------------------------------------
// Instruction opcodes: 5 bits, carried in instruction bits [15:11]
// ---------------------------------------------------------------

// No-operation / stop
`define NOP   5'b00000
`define HALT  5'b00001

// Data-memory access and load-immediate-high
`define LOAD  5'b00010
`define STORE 5'b00011
`define LDIH  5'b00100

// Add / subtract / compare
`define ADD   5'b00101
`define ADDI  5'b00110
`define ADDC  5'b00111
`define SUB   5'b01000
`define SUBI  5'b01001
`define SUBC  5'b01010
`define CMP   5'b01011

// Bitwise logic
`define AND   5'b01100
`define OR    5'b01101
`define XOR   5'b01110

// Shifts
`define SLL   5'b01111
`define SRL   5'b10000
`define SLA   5'b10001
`define SRA   5'b10010

// Jumps and conditional branches (on zf / nf / cf)
`define JUMP  5'b10011
`define JMPR  5'b10100
`define BZ    5'b10101
`define BNZ   5'b10110
`define BN    5'b10111
`define BNN   5'b11000
`define BC    5'b11001
`define BNC   5'b11010

module clock_gating (
    // Clock and active-low asynchronous reset
    input  wire        clk,
    input  wire        reset,
    // The control FSM leaves idle only when enable and start are both 1
    input  wire        enable,
    input  wire        start,
    // Read data from data memory / instruction word from instruction memory
    input  wire [15:0] d_datain,
    input  wire [15:0] i_datain,
    // Instruction memory address (mirrors pc)
    output wire [7:0]  i_addr,
    // Data memory address and program counter
    output reg  [7:0]  d_addr,
    output reg  [7:0]  pc,
    // Data memory write data and write enable
    output reg  [15:0] d_dataout,
    output reg         d_we
);

    // Control FSM state
    reg state;
    reg nextstate;

    // General-purpose register file: 8 registers of 16 bits
    reg [15:0] gr [0:7];

    // Instruction registers, one per pipeline stage
    reg [15:0] id_ir;
    reg [15:0] ex_ir;
    reg [15:0] mem_ir;
    reg [15:0] wb_ir;

    // ALU operands and ALU output
    reg [15:0] reg_A;
    reg [15:0] reg_B;
    reg [15:0] ALUo;

    // ALU result as it travels through MEM and WB
    reg [15:0] reg_C;
    reg [15:0] reg_C1;

    // STORE data as it travels through EX and MEM
    reg [15:0] smdr;
    reg [15:0] smdr1;

    // Pipelined data-memory write strobe and condition flags
    reg dw;
    reg zf;
    reg nf;
    reg cf;

    // Instruction fetch address always follows the program counter
    assign i_addr = pc;

//************* CPU control *************//

// State register: asynchronous active-low reset to idle.
always @(posedge clk or negedge reset)
begin
    if (!reset)
        state <= `idle;
    else
        state <= nextstate;
end

// Next-state logic (purely combinational).
//   idle -> exec : enable and start are both 1
//   exec -> idle : enable drops, or a HALT reaches the WB stage
// Blocking assignments are used because this is combinational logic, and
// nextstate gets a default value so that no latch is inferred and an
// unknown `state` resolves to idle.
always @(*)
begin
    nextstate = `idle;
    case (state)
        `idle : begin
            if ((enable == 1'b1) && (start == 1'b1))
                nextstate = `exec;
            else
                nextstate = `idle;
        end
        `exec : begin
            if ((enable == 1'b0) || (wb_ir[15:11] == `HALT)) // HALT
                nextstate = `idle;
            else
                nextstate = `exec;
        end
        default : nextstate = `idle;
    endcase
end

//************* IF : Instruction fetch *************//

// Opcode of the instruction currently presented by instruction memory.
wire [4:0] if_op = i_datain[15:11];

// A JMPR, or a conditional branch whose flag condition holds, is in MEM:
// the fetch is redirected to the target held in reg_C.
wire if_redirect =
       ((mem_ir[15:11] == `BZ)  && (zf == 1'b1)) || ((mem_ir[15:11] == `BN)  && (nf == 1'b1))
    || ((mem_ir[15:11] == `BNZ) && (zf == 1'b0)) || ((mem_ir[15:11] == `BNN) && (nf == 1'b0))
    || ((mem_ir[15:11] == `BC)  && (cf == 1'b1)) || ((mem_ir[15:11] == `BNC) && (cf == 1'b0))
    ||  (mem_ir[15:11] == `JMPR);

// Which register fields the incoming instruction reads as source operands.
// r1 = bits [10:8], r2 = bits [6:4], r3 = bits [2:0].
wire if_reads_r1 =
       (if_op == `STORE) || (if_op == `LDIH) || (if_op == `ADDI) || (if_op == `SUBI)
    || (if_op == `JMPR)  || (if_op == `BZ)   || (if_op == `BNZ)  || (if_op == `BN)
    || (if_op == `BNN)   || (if_op == `BC)   || (if_op == `BNC);

// LOAD is included here: its address base is r2 (the ID stage reads
// gr[id_ir[6:4]] for it), so a LOAD that uses the previous LOAD's
// destination as base must stall as well.
wire if_reads_r2 =
       (if_op == `LOAD) || (if_op == `STORE)
    || (if_op == `ADD)  || (if_op == `ADDC) || (if_op == `SUB) || (if_op == `SUBC)
    || (if_op == `CMP)  || (if_op == `AND)  || (if_op == `OR)  || (if_op == `XOR)
    || (if_op == `SLL)  || (if_op == `SRL)  || (if_op == `SLA) || (if_op == `SRA);

wire if_reads_r3 =
       (if_op == `ADD) || (if_op == `ADDC) || (if_op == `SUB) || (if_op == `SUBC)
    || (if_op == `CMP) || (if_op == `AND)  || (if_op == `OR)  || (if_op == `XOR);

// Load-use hazard: the instruction in ID is a LOAD and the incoming
// instruction reads the register that LOAD will write (id_ir[10:8]).
wire if_load_use =
       (id_ir[15:11] == `LOAD)
    && (   (if_reads_r1 && (id_ir[10:8] == i_datain[10:8]))
        || (if_reads_r2 && (id_ir[10:8] == i_datain[6:4]))
        || (if_reads_r3 && (id_ir[10:8] == i_datain[2:0])) );

always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        id_ir <= 16'b0000_0000_0000_0000;
        pc    <= 8'b0000_0000;
    end
    else if (state == `exec)
    begin
        if (if_redirect)
        begin
            // Taken branch / JMPR: continue fetching from the target.
            pc    <= reg_C[7:0];
            id_ir <= i_datain;
        end
        else if (if_op == `JUMP)
        begin
            // JUMP: the target is the low byte of the instruction itself.
            pc    <= i_datain[7:0];
            id_ir <= i_datain;
        end
        else if (if_load_use)
        begin
            // Stall for one cycle: hold pc and insert a bubble into ID.
            // The bubble is an explicit NOP rather than 16'bx, because an
            // unknown instruction word may synthesize to any opcode.
            pc    <= pc;
            id_ir <= {`NOP, 11'b000_0000_0000};
        end
        else
        begin
            // Normal sequential fetch. This branch also covers a LOAD in ID
            // followed by an instruction with no register conflict;
            // previously that case assigned nothing, so id_ir stayed LOAD
            // and the pipeline could never advance again.
            pc    <= pc + 8'b1;
            id_ir <= i_datain;
        end
    end
    // state == `idle: pc and id_ir hold their values.
end

//************* ID : Instruction Decode *************//

// A JMPR, or a conditional branch whose flag condition holds, is in MEM.
// Operands read from the register file are zeroed in that case.
// (The opcode is always taken from bits [15:11]; comparing bits [10:8]
// against a 5-bit opcode can never match.)
wire id_branch_taken =
        (mem_ir[15:11] == `JMPR)
    || ((mem_ir[15:11] == `BZ)  && (zf == 1'b1))
    || ((mem_ir[15:11] == `BNZ) && (zf == 1'b0))
    || ((mem_ir[15:11] == `BN)  && (nf == 1'b1))
    || ((mem_ir[15:11] == `BNN) && (nf == 1'b0))
    || ((mem_ir[15:11] == `BC)  && (cf == 1'b1))
    || ((mem_ir[15:11] == `BNC) && (cf == 1'b0));

// reg_A is read from r1 (bits [10:8]) for instructions of the form
// r1 <- r1 op immediate and for branch/JMPR target computation;
// every other instruction reads r2 (bits [6:4]).
// SUBI belongs to this group: reg_B already treats it as an
// r1-immediate instruction.
wire id_A_is_r1 =
       (id_ir[15:11] == `LDIH) || (id_ir[15:11] == `ADDI) || (id_ir[15:11] == `SUBI)
    || (id_ir[15:11] == `BZ)   || (id_ir[15:11] == `BNZ)
    || (id_ir[15:11] == `BN)   || (id_ir[15:11] == `BNN)
    || (id_ir[15:11] == `BC)   || (id_ir[15:11] == `BNC)
    || (id_ir[15:11] == `JMPR);

// Returns 1 when opcode `op` produces a value for gr[r1]. Only these
// instructions may act as forwarding sources; STORE, CMP, jumps and
// branches carry a value in bits [10:8] but never produce a register result.
function id_writes_r1;
    input [4:0] op;
    begin
        case (op)
            `LOAD, `LDIH,
            `ADD, `ADDI, `ADDC,
            `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR,
            `SLL, `SRL, `SLA, `SRA : id_writes_r1 = 1'b1;
            default                : id_writes_r1 = 1'b0;
        endcase
    end
endfunction

// Returns the current value of register `r`, using data forwarding:
//   1st order : producer in EX  -> ALUo (a LOAD has no data yet in EX;
//               the IF stage stalls that case)
//   2nd order : producer in MEM -> d_datain for LOAD, otherwise reg_C
//   3rd order : producer in WB  -> reg_C1
//   otherwise : the register file
function [15:0] id_operand;
    input [2:0] r;
    begin
        if ((r == ex_ir[10:8]) && id_writes_r1(ex_ir[15:11]) && (ex_ir[15:11] != `LOAD))
            id_operand = ALUo;
        else if ((r == mem_ir[10:8]) && id_writes_r1(mem_ir[15:11]))
        begin
            if (mem_ir[15:11] == `LOAD)
                id_operand = d_datain;
            else
                id_operand = reg_C;
        end
        else if ((r == wb_ir[10:8]) && id_writes_r1(wb_ir[15:11]))
            id_operand = reg_C1;
        else
            id_operand = gr[r];
    end
endfunction

always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        ex_ir <= 16'b0000_0000_0000_0000;
        reg_A <= 16'b0000_0000_0000_0000;
        reg_B <= 16'b0000_0000_0000_0000;
        smdr  <= 16'b0000_0000_0000_0000;
    end
    else if (state == `exec)
    begin
        ex_ir <= id_ir;

        //------ reg_A (with hazard handling) ------//
        if (id_branch_taken)
            reg_A <= 16'b0000_0000_0000_0000;
        else if (id_A_is_r1)
            reg_A <= id_operand(id_ir[10:8]);
        else
            reg_A <= id_operand(id_ir[6:4]);

        //------ reg_B (with hazard handling) ------//
        if (   (id_ir[15:11] == `LOAD) || (id_ir[15:11] == `STORE)
            || (id_ir[15:11] == `SLL)  || (id_ir[15:11] == `SRL)
            || (id_ir[15:11] == `SLA)  || (id_ir[15:11] == `SRA) )
            // reg_B is val3, zero-extended
            reg_B <= {12'b0000_0000_0000, id_ir[3:0]};
        else if (id_ir[15:11] == `LDIH)
            // LDIH : r1 <- r1 + {val2, val3, 0000_0000}
            reg_B <= {id_ir[7:0], 8'b0000_0000};
        else if (   (id_ir[15:11] == `ADDI) || (id_ir[15:11] == `SUBI)
                 || (id_ir[15:11] == `BZ)   || (id_ir[15:11] == `BNZ)
                 || (id_ir[15:11] == `BN)   || (id_ir[15:11] == `BNN)
                 || (id_ir[15:11] == `BC)   || (id_ir[15:11] == `BNC)
                 || (id_ir[15:11] == `JMPR) )
            // reg_B is {val2, val3}, zero-extended
            reg_B <= {8'b0000_0000, id_ir[7:0]};
        else if (id_branch_taken)
            // reg_B would come from r3, but a branch is being taken
            reg_B <= 16'b0000_0000_0000_0000;
        else
            // reg_B is r3
            reg_B <= id_operand(id_ir[2:0]);

        //------ smdr : STORE data, read from r1 (with hazard handling) ------//
        if (id_ir[15:11] == `STORE)
        begin
            if (id_branch_taken)
                smdr <= 16'b0000_0000_0000_0000;
            else
                smdr <= id_operand(id_ir[10:8]);
        end
        // For any other instruction smdr keeps its value.
    end
end

//***************** clock gating ***************//

// Registered flag telling whether the instruction held in mem_ir was a
// data-memory access (LOAD or STORE). Because the flag is itself a
// flip-flop, its value during a cycle reflects the opcode mem_ir held
// before the most recent clock edge.
// NOTE(review): logic that samples this flag at a clock edge therefore sees
// the previous cycle's MEM-stage opcode; confirm that this one-cycle
// offset is what the consumer expects.
reg clock_gating;

always @(posedge clk or negedge reset)
begin
    if (!reset)
        clock_gating <= 0;
    else
    begin
        case (mem_ir[15:11])
            `LOAD, `STORE : clock_gating <= 1'b1;
            default       : clock_gating <= 0;
        endcase
    end
end

//************* EX *************//

// Carry/borrow produced by the ALU operation currently in EX. It is kept
// separate from the architectural carry flag cf so that cf can be a real
// register: previously cf was assigned inside the combinational ALU block
// while also being read by ADDC/SUBC, which formed a combinational
// feedback loop.
reg alu_cf;

always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        mem_ir <= 0;
        zf     <= 0;
        nf     <= 0;
        cf     <= 0;
        reg_C  <= 0;
        dw     <= 0;
        smdr1  <= 0;
    end
    else if (state == `exec)
    begin
        mem_ir <= ex_ir;
        reg_C  <= ALUo;
        smdr1  <= smdr;

        // zf / nf are updated by CMP only; any other instruction leaves them.
        if (ex_ir[15:11] == `CMP)
        begin
            if (ALUo == 16'b0000_0000_0000_0000)
                zf <= 1'b1;
            else
                zf <= 1'b0;

            // nf follows the result's sign bit. It is now also cleared when
            // the result is zero; previously a stale nf = 1 survived an
            // "equal" compare.
            if (ALUo[15] == 1'b1)
                nf <= 1'b1;
            else
                nf <= 1'b0;
        end

        // cf is updated by the same operations as before, except the target
        // address computation of JMPR and the conditional branches: a
        // BC/BNC passes through EX one cycle before its condition is
        // evaluated in MEM, so it must not overwrite the flag it tests.
        case (ex_ir[15:11])
            `LOAD, `STORE, `LDIH,
            `ADD, `ADDI, `ADDC,
            `SUB, `SUBI, `SUBC, `CMP,
            `AND, `OR, `XOR,
            `SLL, `SRL, `SLA, `SRA : cf <= alu_cf;
            default                : ; // cf holds
        endcase

        // Data-memory write strobe for the instruction entering MEM.
        if (ex_ir[15:11] == `STORE)
            dw <= 1'b1;
        else
            dw <= 1'b0;
    end
end

// ALUo

// Signed view of reg_A, needed for the arithmetic right shift.
wire signed [15:0] A_reg = reg_A;

// Combinational ALU. Every result is evaluated 17 bits wide so that bit 16
// (carry / borrow / bit shifted out) lands in alu_cf. Blocking assignments
// are used because this is combinational logic, and the default assigns
// constants so that no latch is inferred for ALUo.
always @(*)
begin
    case (ex_ir[15:11])
        // Address and target computation, plain addition
        `LOAD, `STORE, `LDIH,
        `ADD, `ADDI,
        `JMPR, `BZ, `BNZ, `BN, `BNN, `BC, `BNC
              : {alu_cf, ALUo} = reg_A + reg_B;
        `ADDC : {alu_cf, ALUo} = reg_A + reg_B + cf;
        // CMP computes the difference; its only purpose is to set the flags
        `SUB, `SUBI, `CMP
              : {alu_cf, ALUo} = reg_A - reg_B;
        `SUBC : {alu_cf, ALUo} = reg_A - reg_B - cf;
        `AND  : {alu_cf, ALUo} = reg_A & reg_B;
        `OR   : {alu_cf, ALUo} = reg_A | reg_B;
        `XOR  : {alu_cf, ALUo} = reg_A ^ reg_B;
        `SLL  : {alu_cf, ALUo} = reg_A << reg_B;   // logical left shift, zero fill
        `SRL  : {alu_cf, ALUo} = reg_A >> reg_B;   // logical right shift, zero fill
        `SLA  : {alu_cf, ALUo} = reg_A <<< reg_B;  // arithmetic left shift == logical left shift
        `SRA  : {alu_cf, ALUo} = A_reg >>> reg_B;  // arithmetic right shift, sign fill
        default : begin
            // NOP, HALT, JUMP and unused opcodes produce no result.
            alu_cf = 1'b0;
            ALUo   = 16'b0000_0000_0000_0000;
        end
    endcase
end

//************* MEM *************//
always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        wb_ir     <= 0;
        reg_C1    <= 0;
        d_dataout <= 0;
        d_we      <= 0;
        d_addr    <= 0;
    end
    else if (state == `exec)
    begin
        wb_ir <= mem_ir;

        //*************** clock gating **************//
        // The data-memory interface registers are only refreshed for
        // instructions that actually access data memory.
        //
        // They are captured from the EX-stage values (ALUo, smdr, ex_ir) at
        // the same clock edge at which the instruction moves from EX to
        // MEM. During the cycle the LOAD/STORE spends in MEM this gives
        // d_addr == reg_C[7:0], d_dataout == smdr1 and d_we == dw, which is
        // what the rest of the pipeline relies on: both this stage and the
        // ID-stage forwarding read d_datain while mem_ir holds the LOAD.
        // Enabling the capture from a flag that lags mem_ir by one cycle
        // latched the values belonging to the following instruction and
        // lost an isolated STORE.
        // NOTE(review): this assumes the external data memory presents
        // d_datain combinationally from d_addr - confirm against the
        // memory model in use.

        // The write strobe is a control signal and is never gated: it must
        // drop again after every STORE.
        if (ex_ir[15:11] == `STORE)
            d_we <= 1'b1;
        else
            d_we <= 1'b0;

        // The address register toggles only for LOAD and STORE.
        if ((ex_ir[15:11] == `LOAD) || (ex_ir[15:11] == `STORE))
            d_addr <= ALUo[7:0];

        // The write-data register toggles only for STORE.
        if (ex_ir[15:11] == `STORE)
            d_dataout <= smdr;

        // Value handed to WB: memory read data for LOAD, otherwise the ALU
        // result.
        if (mem_ir[15:11] == `LOAD)
            reg_C1 <= d_datain;
        else
            reg_C1 <= reg_C;
    end
end

//************* WB *************//
always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        // Clear the whole register file. (The reset branch used to end with
        // gr[wb_ir[10:8]] <= gr[wb_ir[10:8]]; as the last non-blocking
        // assignment it overrode the clear for that one register.)
        gr[0] <= 16'b0000_0000_0000_0000;
        gr[1] <= 16'b0000_0000_0000_0000;
        gr[2] <= 16'b0000_0000_0000_0000;
        gr[3] <= 16'b0000_0000_0000_0000;
        gr[4] <= 16'b0000_0000_0000_0000;
        gr[5] <= 16'b0000_0000_0000_0000;
        gr[6] <= 16'b0000_0000_0000_0000;
        gr[7] <= 16'b0000_0000_0000_0000;
    end
    else if (state == `exec)
    begin
        // Only instructions that produce a register result write gr[r1].
        // CMP is not among them: it exists to set the flags, and the
        // forwarding logic in ID never treats it as a producer, so writing
        // its difference into gr[r1] would corrupt that register.
        case (wb_ir[15:11])
            `LOAD, `LDIH,
            `ADD, `ADDI, `ADDC,
            `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR,
            `SLL, `SRL, `SLA, `SRA : gr[wb_ir[10:8]] <= reg_C1;
            default                : ; // no register write
        endcase
    end
end

endmodule

优化前后的XPOWER测试结果比较:

Basic design:



With clock gating:

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: