五级流水线CPU之低功耗设计 (一) :Bypassing(旁路)
2015-03-02 17:20
465 查看
五级流水线整体图示:
某些指令执行的流水线级数:
由上面两幅图可知,流水线的第四级是Data Memory,用于数据的存入和读取。但是CPU的指令集中只有LOAD指令和STORE指令在第四级流水线使用了Data Memory里的数据,其他指令在这一级流水线并没有进行任何操作(并没有使用Data Memory里的数据)。也就是说,流水线第四级对除LOAD指令和STORE指令之外的指令并没有实质意义,却导致了不必要的功率损耗。因此,对于LOAD、STORE之外的指令,采用四级流水线(将WB级提前一级)可以减少不必要的功率损耗。实现过程图如下:
由于某些指令需要五级流水线,而某些指令只需四级流水线(将WB阶段提前一级),因此在实现的过程中要注意冲突问题。
为更清晰的表达利用旁路实现低功耗的原理和讨论冲突问题,先将25条指令(27条指令除去NOP和HALT指令)分为四类:
第一类:
指令:LOAD
第二类:
指令:STORE,JMPR,BZ,BNZ,BN,BNN,BC,BNC
第三类:
指令:LDIH、ADD、ADDI、ADDC、SUB、SUBI、SUBC、OR、AND、XOR、SLL、SRL、SLA、SRA
第四类:
指令:CMP,JUMP
由上面的分类可知:第一类指令必须采用五级流水线;第二类指令,空操作NOP处于流水线第五级(流水线最后一级),没有办法对流水线提前一级;第四类指令,空操作NOP处于流水线第四第五级,也没有办法对流水线提前一级;第三类指令,空操作NOP只处于流水线第四级,可提前一级流水线,但当前指令为第三类中的指令时,上一条指令的WB阶段不为NOP时,若把当前指令的WB阶段提前一级,会出现冲突,也就是说把当前指令的WB阶段提前一级的前提是上一条指令是第二类或者第四类指令,不能为第一类指令。
算术类指令并没有使用流水线第四阶段(Data Memory);同时,STORE指令并不需要WB这一级流水线,只有LOAD指令用到了全部五级流水线。流水线寄存器在无用阶段中的翻转会造成额外的功率消耗,这些不必要的翻转可以通过旁路无用的阶段来减少。算术类指令不需要访问内存,因此由EX阶段得到的数据可以直接传给第五阶段WB。在此过程中,EX/MEM流水线寄存器保持原值不更新,从而保证了这些寄存器不发生翻转。
代码如下:
// ---- Control FSM state encoding (1 bit) ----
`define idle  1'h0
`define exec  1'h1

// ---- 5-bit opcodes, carried in instruction bits [15:11] ----
// Pipeline control
`define NOP   5'h00
`define HALT  5'h01
// Data memory access
`define LOAD  5'h02
`define STORE 5'h03
// Load immediate high / add / subtract / compare
`define LDIH  5'h04
`define ADD   5'h05
`define ADDI  5'h06
`define ADDC  5'h07
`define SUB   5'h08
`define SUBI  5'h09
`define SUBC  5'h0A
`define CMP   5'h0B
// Bitwise logic
`define AND   5'h0C
`define OR    5'h0D
`define XOR   5'h0E
// Shifts
`define SLL   5'h0F
`define SRL   5'h10
`define SLA   5'h11
`define SRA   5'h12
// Jumps and conditional branches
`define JUMP  5'h13
`define JMPR  5'h14
`define BZ    5'h15
`define BNZ   5'h16
`define BN    5'h17
`define BNN   5'h18
`define BC    5'h19
`define BNC   5'h1A
// Bypassing: 16-bit pipelined CPU core (IF, ID, EX, MEM, WB stages) in which
// selected ALU instructions skip the EX/MEM pipeline registers.
//   clk       - clock; all pipeline registers update on the rising edge
//   reset     - asynchronous reset, active low
//   enable    - must be 1 to enter and to stay in the exec state
//   start     - together with enable, moves the FSM from idle to exec
//   d_datain  - data read from data memory
//   i_datain  - instruction word read from instruction memory
//   i_addr    - instruction memory address (driven directly by pc)
//   d_addr    - data memory address
//   pc        - program counter
//   d_dataout - data written to data memory
//   d_we      - data memory write enable
module Bypassing(
input clk,reset,enable,start,
input [15:0] d_datain,i_datain,
output wire [7:0] i_addr,
output reg [7:0] d_addr,pc,
output reg [15:0] d_dataout,
output reg d_we
);
// Control FSM state (`idle / `exec) and its combinational next value.
reg state,nextstate;
// General-purpose register file: 8 registers of 16 bits.
reg [15:0] gr[0:7];
// id_ir/ex_ir/mem_ir/wb_ir : instruction word held by each pipeline stage.
// reg_A/reg_B              : ALU operands latched in ID.
// ALUo                     : ALU result (driven combinationally).
// reg_C/reg_C1             : result registers written in EX and MEM.
// smdr/smdr1               : store data travelling with a STORE through ID and EX.
reg [15:0] id_ir,ex_ir,mem_ir,reg_A,reg_B,reg_C,ALUo,smdr,smdr1,reg_C1,wb_ir;
// dw: write-enable for a STORE, one stage ahead of d_we. zf/nf/cf: zero, negative, carry flags.
reg dw,zf,nf,cf;
// The instruction address is always the current program counter.
assign i_addr = pc;
//************* CPU control (two-process FSM) *************//
// State register: asynchronously cleared to idle while reset is low,
// otherwise loaded with nextstate on every rising clock edge.
always @(posedge clk or negedge reset)
begin
    if (!reset)
        state <= `idle;
    else
        state <= nextstate;
end

// Next-state logic. This block is purely combinational, so it uses blocking
// assignments and assigns a default first; nextstate is then driven on every
// path and no latch is inferred.
always @(*)
begin
    nextstate = `idle;
    case (state)
        `idle : begin
            // Leave idle only when both enable and start are asserted.
            if ((enable == 1'b1) && (start == 1'b1))
                nextstate = `exec;
            else
                nextstate = `idle;
        end
        `exec : begin
            // Stop when enable drops or a HALT reaches the WB stage.
            if ((enable == 1'b0) || (wb_ir[15:11] == `HALT))
                nextstate = `idle;
            else
                nextstate = `exec;
        end
        default : nextstate = `idle;
    endcase
end
//************* IF : Instruction fetch *************//
// Updates pc and the IF/ID instruction register id_ir.
// Priority: taken branch in MEM > JUMP being fetched > load-use stall > pc + 1.

// Opcode of the instruction currently being fetched.
wire [4:0] if_op = i_datain[15:11];

// A conditional branch or JMPR sitting in the MEM stage redirects the fetch.
wire if_branch_taken =
       ((mem_ir[15:11] == `BZ)  && (zf == 1'b1))
    || ((mem_ir[15:11] == `BNZ) && (zf == 1'b0))
    || ((mem_ir[15:11] == `BN)  && (nf == 1'b1))
    || ((mem_ir[15:11] == `BNN) && (nf == 1'b0))
    || ((mem_ir[15:11] == `BC)  && (cf == 1'b1))
    || ((mem_ir[15:11] == `BNC) && (cf == 1'b0))
    ||  (mem_ir[15:11] == `JMPR);

// Fetched instruction reads its r1 field (bits [10:8]) as a source.
wire if_reads_r1 =
       (if_op == `STORE) || (if_op == `LDIH) || (if_op == `ADDI) || (if_op == `SUBI)
    || (if_op == `JMPR)  || (if_op == `BZ)   || (if_op == `BNZ)  || (if_op == `BN)
    || (if_op == `BNN)   || (if_op == `BC)   || (if_op == `BNC);

// Fetched instruction reads its r2 field (bits [6:4]) as a source.
// NOTE(review): LOAD also takes operand A from r2 in the ID stage but is not
// listed here, matching the original hazard check - confirm whether a LOAD
// directly after a LOAD of its base register needs a stall as well.
wire if_reads_r2 =
       (if_op == `STORE) || (if_op == `ADD) || (if_op == `ADDC) || (if_op == `SUB)
    || (if_op == `SUBC)  || (if_op == `CMP) || (if_op == `AND)  || (if_op == `OR)
    || (if_op == `XOR)   || (if_op == `SLL) || (if_op == `SRL)  || (if_op == `SLA)
    || (if_op == `SRA);

// Fetched instruction reads its r3 field (bits [2:0]) as a source.
wire if_reads_r3 =
       (if_op == `ADD) || (if_op == `ADDC) || (if_op == `SUB) || (if_op == `SUBC)
    || (if_op == `CMP) || (if_op == `AND)  || (if_op == `OR)  || (if_op == `XOR);

// Load-use hazard: the instruction in ID is a LOAD whose destination register
// is a source of the instruction being fetched.
wire if_load_use = (id_ir[15:11] == `LOAD) &&
       (  (if_reads_r1 && (id_ir[10:8] == i_datain[10:8]))
       || (if_reads_r2 && (id_ir[10:8] == i_datain[6:4]))
       || (if_reads_r3 && (id_ir[10:8] == i_datain[2:0])) );

always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        id_ir <= 16'b0000_0000_0000_0000;
        pc    <= 8'b0000_0000;
    end
    else if (state == `exec)
    begin
        if (if_branch_taken)
        begin
            // Redirect to the target computed by the ALU.
            // NOTE(review): the instruction fetched this cycle is still latched
            // into id_ir; the ID stage only zeroes its operands - confirm that
            // this is a sufficient flush.
            pc    <= reg_C[7:0];
            id_ir <= i_datain;
        end
        else if (if_op == `JUMP)
        begin
            // Unconditional jump: the target is the low byte of the instruction.
            pc    <= i_datain[7:0];
            id_ir <= i_datain;
        end
        else if (if_load_use)
        begin
            // Stall one cycle: hold pc and insert a NOP bubble. A defined NOP
            // is used instead of X so the bubble is well defined after synthesis.
            pc    <= pc;
            id_ir <= {`NOP, 11'b0};
        end
        else
        begin
            // Normal sequential fetch. This branch also covers a LOAD in ID
            // followed by an instruction that does not depend on it; without
            // it pc and id_ir would be held and the pipeline would never advance.
            pc    <= pc + 8'b1;
            id_ir <= i_datain;
        end
    end
    else if (state == `idle)
        pc <= pc; // idle: hold the program counter
end
//************* ID : Instruction Decode *************//
// Latches ex_ir, the ALU operands reg_A / reg_B and the store data smdr.
// Register operands are taken from the newest in-flight producer (forwarding)
// or, if there is none, from the register file.

// Returns 1 when opcode `op` writes a result to its r1 field (bits [10:8]).
// Only such instructions may act as forwarding sources; STORE, CMP, JUMP and
// the branches write no register and must not be forwarded from.
function id_writes_reg;
    input [4:0] op;
    begin
        case (op)
            `LOAD, `LDIH, `ADD, `ADDI, `ADDC, `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR, `SLL, `SRL, `SLA, `SRA : id_writes_reg = 1'b1;
            default                                  : id_writes_reg = 1'b0;
        endcase
    end
endfunction

// Returns the current value of register `r`, resolving data hazards:
//   distance 1 - producer in EX : ALUo (a LOAD in EX has no data yet; that
//                case has to be covered by the stall in the IF stage)
//   distance 2 - producer in MEM: d_datain for LOAD, otherwise reg_C
//   distance 3 - producer in WB : reg_C1
//   otherwise  - register file gr[r]
function [15:0] id_operand;
    input [2:0] r;
    begin
        if ((r == ex_ir[10:8]) && id_writes_reg(ex_ir[15:11]) && (ex_ir[15:11] != `LOAD))
            id_operand = ALUo;
        else if ((r == mem_ir[10:8]) && id_writes_reg(mem_ir[15:11]))
        begin
            if (mem_ir[15:11] == `LOAD)
                id_operand = d_datain;
            else
                id_operand = reg_C;
        end
        else if ((r == wb_ir[10:8]) && id_writes_reg(wb_ir[15:11]))
            id_operand = reg_C1;
        else
            id_operand = gr[r];
    end
endfunction

// A taken branch or JMPR in the MEM stage squashes the operands being latched.
// The opcode field [15:11] is compared in every term; the earlier code
// compared mem_ir[10:8] with `JMPR in two places, which can never match.
wire id_flush =
        (mem_ir[15:11] == `JMPR)
    || ((mem_ir[15:11] == `BZ)  && (zf == 1'b1))
    || ((mem_ir[15:11] == `BNZ) && (zf == 1'b0))
    || ((mem_ir[15:11] == `BN)  && (nf == 1'b1))
    || ((mem_ir[15:11] == `BNN) && (nf == 1'b0))
    || ((mem_ir[15:11] == `BC)  && (cf == 1'b1))
    || ((mem_ir[15:11] == `BNC) && (cf == 1'b0));

// Instructions whose operand A is r1; every other instruction uses r2.
// SUBI is included because its operand B is the immediate {val2,val3}, so
// bits [6:4] belong to the immediate and cannot be a register index.
wire id_a_is_r1 =
       (id_ir[15:11] == `LDIH) || (id_ir[15:11] == `ADDI) || (id_ir[15:11] == `SUBI)
    || (id_ir[15:11] == `BZ)   || (id_ir[15:11] == `BNZ)  || (id_ir[15:11] == `BN)
    || (id_ir[15:11] == `BNN)  || (id_ir[15:11] == `BC)   || (id_ir[15:11] == `BNC)
    || (id_ir[15:11] == `JMPR);

always @(posedge clk or negedge reset)
begin
    if (!reset) begin
        ex_ir <= 16'b0000_0000_0000_0000;
        reg_A <= 16'b0000_0000_0000_0000;
        reg_B <= 16'b0000_0000_0000_0000;
        smdr  <= 16'b0000_0000_0000_0000;
    end
    else if (state == `exec)
    begin
        ex_ir <= id_ir;

        //------ reg_A (operand A, with hazard handling) ------//
        if (id_a_is_r1)
            reg_A <= id_operand(id_ir[10:8]);
        else
            reg_A <= id_operand(id_ir[6:4]);
        if (id_flush)
            reg_A <= 16'b0; // last nonblocking assignment wins

        //------ reg_B (operand B, with hazard handling) ------//
        case (id_ir[15:11])
            // 4-bit immediate val3
            `LOAD, `STORE, `SLL, `SRL, `SLA, `SRA :
                reg_B <= {12'b0000_0000_0000, id_ir[3:0]};
            // LDIH : r1 <- r1 + {val2,val3,0000_0000}
            `LDIH :
                reg_B <= {id_ir[7:0], 8'b0000_0000};
            // 8-bit immediate {val2,val3}
            `ADDI, `SUBI, `BZ, `BNZ, `BN, `BNN, `BC, `BNC, `JMPR :
                reg_B <= {8'b0000_0000, id_ir[7:0]};
            // register operand r3; only this case is squashed by a flush
            default :
            begin
                reg_B <= id_operand(id_ir[2:0]);
                if (id_flush)
                    reg_B <= 16'b0;
            end
        endcase

        //------ smdr (store data, with hazard handling) ------//
        if (id_ir[15:11] == `STORE)
        begin
            smdr <= id_operand(id_ir[10:8]);
            if (id_flush)
                smdr <= 16'b0;
        end
        else
            smdr <= smdr;
    end
end
//************* EX *************//
// Latches the EX/MEM pipeline registers, the zf/nf flags and the store
// write-enable dw.
// NOTE(review): `bypassing` is declared further down in this module; some
// tools require a variable to be declared before its first use - confirm the
// target tool accepts this ordering.
always @(posedge clk or negedge reset)
begin
    if (!reset) begin
        mem_ir <= 0;
        zf     <= 0;
        nf     <= 0;
        reg_C  <= 0;
        dw     <= 0;
        smdr1  <= 0;
    end
    else if (state == `exec)
    begin
        //*************** Bypassing ***************//
        // While bypassing is active the EX/MEM registers keep their old
        // values, so they do not toggle.
        if (!bypassing)
        begin
            reg_C  <= ALUo;
            mem_ir <= ex_ir;
            smdr1  <= smdr;
        end

        // CMP updates both flags from the subtraction result. nf is written
        // on every CMP; before, a zero result left nf at the value set by an
        // earlier CMP, so a following BN/BNN could see a stale flag.
        if (ex_ir[15:11] == `CMP)
        begin
            if (ALUo == 16'b0000_0000_0000_0000)
                zf <= 1'b1;
            else
                zf <= 1'b0;
            if (ALUo[15] == 1'b1)
                nf <= 1'b1;
            else
                nf <= 1'b0;
        end
        else begin
            nf <= nf;
            zf <= zf;
        end

        // STORE: request a data-memory write for the next stage.
        if (ex_ir[15:11] == `STORE)
            dw <= 1'b1;
        else
            dw <= 1'b0;
    end
end
// ALU: computes {cf,ALUo} from reg_A and reg_B according to the opcode of the
// instruction in the EX stage (ex_ir[15:11]).
// NOTE(review): this is level-sensitive logic written with nonblocking
// assignments, and the default branch holds cf and ALUo, which infers latches.
// ADDC and SUBC also read cf, which this same block drives, giving a
// combinational feedback path. Left unchanged because branches in the MEM
// stage read the held cf value - confirm before restructuring.
reg signed [15:0] A_reg;// signed copy of reg_A, used by the arithmetic right shift
always @(reg_A)
A_reg <= reg_A;
always @(*)
begin
case(ex_ir[15:11])
`LOAD : {cf,ALUo} <= reg_A + reg_B;
`STORE : {cf,ALUo} <= reg_A + reg_B;
`LDIH : {cf,ALUo} <= reg_A + reg_B;
`ADD : {cf,ALUo} <= reg_A + reg_B;
`ADDI : {cf,ALUo} <= reg_A + reg_B;
`ADDC : {cf,ALUo} <= reg_A + reg_B + cf;
`SUB : {cf,ALUo} <= reg_A - reg_B;
`SUBI : {cf,ALUo} <= reg_A - reg_B;
`SUBC : {cf,ALUo} <= reg_A - reg_B - cf;
`CMP : {cf,ALUo} <= reg_A - reg_B;// CMP: the result is used to set the CF, NF and ZF flags
`AND : {cf,ALUo} <= reg_A & reg_B;
`OR : {cf,ALUo} <= reg_A | reg_B;
`XOR : {cf,ALUo} <= reg_A ^ reg_B;
`SLL : {cf,ALUo} <= reg_A << reg_B;// logical left shift, low bits filled with 0
`SRL : {cf,ALUo} <= reg_A >> reg_B;// logical right shift, high bits filled with 0
`SLA : {cf,ALUo} <= reg_A <<< reg_B;// arithmetic left shift, same as logical left shift
`SRA : {cf,ALUo} <= A_reg >>> reg_B;// arithmetic right shift, high bits filled with the sign bit
`JMPR : {cf,ALUo} <= reg_A + reg_B;
`BZ : {cf,ALUo} <= reg_A + reg_B;
`BNZ : {cf,ALUo} <= reg_A + reg_B;
`BN : {cf,ALUo} <= reg_A + reg_B;
`BNN : {cf,ALUo} <= reg_A + reg_B;
`BC : {cf,ALUo} <= reg_A + reg_B;
`BNC : {cf,ALUo} <= reg_A + reg_B;
// Any other opcode (NOP, HALT, JUMP, undefined): hold the previous values.
default:
begin cf <= cf; ALUo <= ALUo; end
endcase
end
//***************** decide whether to bypass ***************//
// bypassing is set when the instruction in EX is an ALU-type instruction that
// does not use data memory, unless the instruction in MEM is a LOAD.
// NOTE(review): bypassing is a registered signal, so the EX and MEM stages
// consume it one clock after the ex_ir it was derived from - confirm that
// this alignment is intended.
reg bypassing;
always @(posedge clk or negedge reset)
begin
    if (!reset)
        bypassing <= 1'b0;
    else
    begin
        case (ex_ir[15:11])
            `LDIH, `ADD, `ADDI, `ADDC, `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR, `SLL, `SRL, `SLA, `SRA :
            begin
                // A LOAD needs all five stages; writing back early behind it
                // would conflict. Compare the opcode field only: the whole
                // 16-bit instruction word never equals the 5-bit `LOAD code
                // for a real LOAD, so the earlier check could not fire.
                if (mem_ir[15:11] == `LOAD)
                    bypassing <= 1'b0;
                else
                    bypassing <= 1'b1;
            end
            default :
                bypassing <= 1'b0;
        endcase
    end
end
//************* MEM *************//
// Drives the data-memory interface and latches the MEM/WB registers
// wb_ir and reg_C1.
always @(posedge clk or negedge reset)
begin
    if (!reset) begin
        wb_ir     <= 0;
        reg_C1    <= 0;
        d_addr    <= 0; // reset together with the other memory outputs
        d_dataout <= 0;
        d_we      <= 0;
    end
    else if (state == `exec)
    begin
        // Data-memory address, write data and write enable.
        d_addr    <= reg_C[7:0];
        d_dataout <= smdr1;
        d_we      <= dw;

        //*************** Bypassing **************//
        // When bypassing, the instruction in EX goes straight to WB,
        // skipping the EX/MEM register.
        if (bypassing)
            wb_ir <= ex_ir;
        else
            wb_ir <= mem_ir;

        // Value to write back: loaded data has priority, then the bypassed
        // ALU result, then the result held in the EX/MEM register.
        if (mem_ir[15:11] == `LOAD)
            reg_C1 <= d_datain;
        else if (bypassing)
            reg_C1 <= ALUo;
        else
            reg_C1 <= reg_C;
    end
end
//************* WB *************//
// Writes reg_C1 into the destination register r1 (wb_ir[10:8]) for every
// instruction that produces a register result.
integer wb_i; // loop index used only to clear the register file on reset
always @(posedge clk or negedge reset)
begin
    if (!reset) begin
        // Clear all eight registers. The earlier trailing self-assignment
        // gr[wb_ir[10:8]] <= gr[wb_ir[10:8]] overrode the clear for that one
        // register, leaving it at its previous (possibly unknown) value.
        for (wb_i = 0; wb_i < 8; wb_i = wb_i + 1)
            gr[wb_i] <= 16'b0000_0000_0000_0000;
    end
    else if (state == `exec)
    begin
        case (wb_ir[15:11])
            // CMP is not listed: it only updates the flags, and the
            // forwarding logic in ID already treats it as writing no register.
            `LOAD, `LDIH, `ADD, `ADDI, `ADDC, `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR, `SLL, `SRL, `SLA, `SRA :
                gr[wb_ir[10:8]] <= reg_C1;
            default : ; // no register write
        endcase
    end
end
endmodule
优化前后的XPOWER测试结果比较:
Basic design:
With clock gating:
某些指令执行的流水线级数:
由上面两幅图可知,流水线的第四级是Data Memory,用于数据的存入和读取。但是CPU的指令集中只有LOAD指令和STORE指令在第四级流水线使用了Data Memory里的数据,其他指令在这一级流水线并没有进行任何操作(并没有使用Data Memory里的数据)。也就是说,流水线第四级对除LOAD指令和STORE指令之外的指令并没有实质意义,却导致了不必要的功率损耗。因此,对于LOAD、STORE之外的指令,采用四级流水线(将WB级提前一级)可以减少不必要的功率损耗。实现过程图如下:
由于某些指令需要五级流水线,而某些指令只需四级流水线(将WB阶段提前一级),因此在实现的过程中要注意冲突问题。
为更清晰的表达利用旁路实现低功耗的原理和讨论冲突问题,先将25条指令(27条指令除去NOP和HALT指令)分为四类:
第一类:
指令:LOAD
IF | ID | EX | MEM | WB |
指令:STORE,JMPR,BZ,BNZ,BN,BNN,BC,BNC
IF | ID | EX | MEM | NOP |
指令:LDIH、ADD、ADDI、ADDC、SUB、SUBI、SUBC、OR、AND、XOR、SLL、SRL、SLA、SRA
IF | ID | EX | NOP | WB |
指令:CMP,JUMP
IF | ID | NOP | NOP | NOP |
算术类指令并没有使用流水线第四阶段(Data Memory);同时,STORE指令并不需要WB这一级流水线,只有LOAD指令用到了全部五级流水线。流水线寄存器在无用阶段中的翻转会造成额外的功率消耗,这些不必要的翻转可以通过旁路无用的阶段来减少。算术类指令不需要访问内存,因此由EX阶段得到的数据可以直接传给第五阶段WB。在此过程中,EX/MEM流水线寄存器保持原值不更新,从而保证了这些寄存器不发生翻转。
代码如下:
// ---- Control FSM state encoding (1 bit) ----
`define idle  1'h0
`define exec  1'h1

// ---- 5-bit opcodes, carried in instruction bits [15:11] ----
// Pipeline control
`define NOP   5'h00
`define HALT  5'h01
// Data memory access
`define LOAD  5'h02
`define STORE 5'h03
// Load immediate high / add / subtract / compare
`define LDIH  5'h04
`define ADD   5'h05
`define ADDI  5'h06
`define ADDC  5'h07
`define SUB   5'h08
`define SUBI  5'h09
`define SUBC  5'h0A
`define CMP   5'h0B
// Bitwise logic
`define AND   5'h0C
`define OR    5'h0D
`define XOR   5'h0E
// Shifts
`define SLL   5'h0F
`define SRL   5'h10
`define SLA   5'h11
`define SRA   5'h12
// Jumps and conditional branches
`define JUMP  5'h13
`define JMPR  5'h14
`define BZ    5'h15
`define BNZ   5'h16
`define BN    5'h17
`define BNN   5'h18
`define BC    5'h19
`define BNC   5'h1A
// Bypassing: 16-bit pipelined CPU core (IF, ID, EX, MEM, WB stages) in which
// selected ALU instructions skip the EX/MEM pipeline registers.
//   clk       - clock; all pipeline registers update on the rising edge
//   reset     - asynchronous reset, active low
//   enable    - must be 1 to enter and to stay in the exec state
//   start     - together with enable, moves the FSM from idle to exec
//   d_datain  - data read from data memory
//   i_datain  - instruction word read from instruction memory
//   i_addr    - instruction memory address (driven directly by pc)
//   d_addr    - data memory address
//   pc        - program counter
//   d_dataout - data written to data memory
//   d_we      - data memory write enable
module Bypassing(
input clk,reset,enable,start,
input [15:0] d_datain,i_datain,
output wire [7:0] i_addr,
output reg [7:0] d_addr,pc,
output reg [15:0] d_dataout,
output reg d_we
);
// Control FSM state (`idle / `exec) and its combinational next value.
reg state,nextstate;
// General-purpose register file: 8 registers of 16 bits.
reg [15:0] gr[0:7];
// id_ir/ex_ir/mem_ir/wb_ir : instruction word held by each pipeline stage.
// reg_A/reg_B              : ALU operands latched in ID.
// ALUo                     : ALU result (driven combinationally).
// reg_C/reg_C1             : result registers written in EX and MEM.
// smdr/smdr1               : store data travelling with a STORE through ID and EX.
reg [15:0] id_ir,ex_ir,mem_ir,reg_A,reg_B,reg_C,ALUo,smdr,smdr1,reg_C1,wb_ir;
// dw: write-enable for a STORE, one stage ahead of d_we. zf/nf/cf: zero, negative, carry flags.
reg dw,zf,nf,cf;
// The instruction address is always the current program counter.
assign i_addr = pc;
//************* CPU control (two-process FSM) *************//
// State register: asynchronously cleared to idle while reset is low,
// otherwise loaded with nextstate on every rising clock edge.
always @(posedge clk or negedge reset)
begin
    if (!reset)
        state <= `idle;
    else
        state <= nextstate;
end

// Next-state logic. This block is purely combinational, so it uses blocking
// assignments and assigns a default first; nextstate is then driven on every
// path and no latch is inferred.
always @(*)
begin
    nextstate = `idle;
    case (state)
        `idle : begin
            // Leave idle only when both enable and start are asserted.
            if ((enable == 1'b1) && (start == 1'b1))
                nextstate = `exec;
            else
                nextstate = `idle;
        end
        `exec : begin
            // Stop when enable drops or a HALT reaches the WB stage.
            if ((enable == 1'b0) || (wb_ir[15:11] == `HALT))
                nextstate = `idle;
            else
                nextstate = `exec;
        end
        default : nextstate = `idle;
    endcase
end
//************* IF : Instruction fetch *************//
// Updates pc and the IF/ID instruction register id_ir.
// Priority: taken branch in MEM > JUMP being fetched > load-use stall > pc + 1.

// Opcode of the instruction currently being fetched.
wire [4:0] if_op = i_datain[15:11];

// A conditional branch or JMPR sitting in the MEM stage redirects the fetch.
wire if_branch_taken =
       ((mem_ir[15:11] == `BZ)  && (zf == 1'b1))
    || ((mem_ir[15:11] == `BNZ) && (zf == 1'b0))
    || ((mem_ir[15:11] == `BN)  && (nf == 1'b1))
    || ((mem_ir[15:11] == `BNN) && (nf == 1'b0))
    || ((mem_ir[15:11] == `BC)  && (cf == 1'b1))
    || ((mem_ir[15:11] == `BNC) && (cf == 1'b0))
    ||  (mem_ir[15:11] == `JMPR);

// Fetched instruction reads its r1 field (bits [10:8]) as a source.
wire if_reads_r1 =
       (if_op == `STORE) || (if_op == `LDIH) || (if_op == `ADDI) || (if_op == `SUBI)
    || (if_op == `JMPR)  || (if_op == `BZ)   || (if_op == `BNZ)  || (if_op == `BN)
    || (if_op == `BNN)   || (if_op == `BC)   || (if_op == `BNC);

// Fetched instruction reads its r2 field (bits [6:4]) as a source.
// NOTE(review): LOAD also takes operand A from r2 in the ID stage but is not
// listed here, matching the original hazard check - confirm whether a LOAD
// directly after a LOAD of its base register needs a stall as well.
wire if_reads_r2 =
       (if_op == `STORE) || (if_op == `ADD) || (if_op == `ADDC) || (if_op == `SUB)
    || (if_op == `SUBC)  || (if_op == `CMP) || (if_op == `AND)  || (if_op == `OR)
    || (if_op == `XOR)   || (if_op == `SLL) || (if_op == `SRL)  || (if_op == `SLA)
    || (if_op == `SRA);

// Fetched instruction reads its r3 field (bits [2:0]) as a source.
wire if_reads_r3 =
       (if_op == `ADD) || (if_op == `ADDC) || (if_op == `SUB) || (if_op == `SUBC)
    || (if_op == `CMP) || (if_op == `AND)  || (if_op == `OR)  || (if_op == `XOR);

// Load-use hazard: the instruction in ID is a LOAD whose destination register
// is a source of the instruction being fetched.
wire if_load_use = (id_ir[15:11] == `LOAD) &&
       (  (if_reads_r1 && (id_ir[10:8] == i_datain[10:8]))
       || (if_reads_r2 && (id_ir[10:8] == i_datain[6:4]))
       || (if_reads_r3 && (id_ir[10:8] == i_datain[2:0])) );

always @(posedge clk or negedge reset)
begin
    if (!reset)
    begin
        id_ir <= 16'b0000_0000_0000_0000;
        pc    <= 8'b0000_0000;
    end
    else if (state == `exec)
    begin
        if (if_branch_taken)
        begin
            // Redirect to the target computed by the ALU.
            // NOTE(review): the instruction fetched this cycle is still latched
            // into id_ir; the ID stage only zeroes its operands - confirm that
            // this is a sufficient flush.
            pc    <= reg_C[7:0];
            id_ir <= i_datain;
        end
        else if (if_op == `JUMP)
        begin
            // Unconditional jump: the target is the low byte of the instruction.
            pc    <= i_datain[7:0];
            id_ir <= i_datain;
        end
        else if (if_load_use)
        begin
            // Stall one cycle: hold pc and insert a NOP bubble. A defined NOP
            // is used instead of X so the bubble is well defined after synthesis.
            pc    <= pc;
            id_ir <= {`NOP, 11'b0};
        end
        else
        begin
            // Normal sequential fetch. This branch also covers a LOAD in ID
            // followed by an instruction that does not depend on it; without
            // it pc and id_ir would be held and the pipeline would never advance.
            pc    <= pc + 8'b1;
            id_ir <= i_datain;
        end
    end
    else if (state == `idle)
        pc <= pc; // idle: hold the program counter
end
//************* ID : Instruction Decode *************//
// Latches ex_ir, the ALU operands reg_A / reg_B and the store data smdr.
// Register operands are taken from the newest in-flight producer (forwarding)
// or, if there is none, from the register file.

// Returns 1 when opcode `op` writes a result to its r1 field (bits [10:8]).
// Only such instructions may act as forwarding sources; STORE, CMP, JUMP and
// the branches write no register and must not be forwarded from.
function id_writes_reg;
    input [4:0] op;
    begin
        case (op)
            `LOAD, `LDIH, `ADD, `ADDI, `ADDC, `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR, `SLL, `SRL, `SLA, `SRA : id_writes_reg = 1'b1;
            default                                  : id_writes_reg = 1'b0;
        endcase
    end
endfunction

// Returns the current value of register `r`, resolving data hazards:
//   distance 1 - producer in EX : ALUo (a LOAD in EX has no data yet; that
//                case has to be covered by the stall in the IF stage)
//   distance 2 - producer in MEM: d_datain for LOAD, otherwise reg_C
//   distance 3 - producer in WB : reg_C1
//   otherwise  - register file gr[r]
function [15:0] id_operand;
    input [2:0] r;
    begin
        if ((r == ex_ir[10:8]) && id_writes_reg(ex_ir[15:11]) && (ex_ir[15:11] != `LOAD))
            id_operand = ALUo;
        else if ((r == mem_ir[10:8]) && id_writes_reg(mem_ir[15:11]))
        begin
            if (mem_ir[15:11] == `LOAD)
                id_operand = d_datain;
            else
                id_operand = reg_C;
        end
        else if ((r == wb_ir[10:8]) && id_writes_reg(wb_ir[15:11]))
            id_operand = reg_C1;
        else
            id_operand = gr[r];
    end
endfunction

// A taken branch or JMPR in the MEM stage squashes the operands being latched.
// The opcode field [15:11] is compared in every term; the earlier code
// compared mem_ir[10:8] with `JMPR in two places, which can never match.
wire id_flush =
        (mem_ir[15:11] == `JMPR)
    || ((mem_ir[15:11] == `BZ)  && (zf == 1'b1))
    || ((mem_ir[15:11] == `BNZ) && (zf == 1'b0))
    || ((mem_ir[15:11] == `BN)  && (nf == 1'b1))
    || ((mem_ir[15:11] == `BNN) && (nf == 1'b0))
    || ((mem_ir[15:11] == `BC)  && (cf == 1'b1))
    || ((mem_ir[15:11] == `BNC) && (cf == 1'b0));

// Instructions whose operand A is r1; every other instruction uses r2.
// SUBI is included because its operand B is the immediate {val2,val3}, so
// bits [6:4] belong to the immediate and cannot be a register index.
wire id_a_is_r1 =
       (id_ir[15:11] == `LDIH) || (id_ir[15:11] == `ADDI) || (id_ir[15:11] == `SUBI)
    || (id_ir[15:11] == `BZ)   || (id_ir[15:11] == `BNZ)  || (id_ir[15:11] == `BN)
    || (id_ir[15:11] == `BNN)  || (id_ir[15:11] == `BC)   || (id_ir[15:11] == `BNC)
    || (id_ir[15:11] == `JMPR);

always @(posedge clk or negedge reset)
begin
    if (!reset) begin
        ex_ir <= 16'b0000_0000_0000_0000;
        reg_A <= 16'b0000_0000_0000_0000;
        reg_B <= 16'b0000_0000_0000_0000;
        smdr  <= 16'b0000_0000_0000_0000;
    end
    else if (state == `exec)
    begin
        ex_ir <= id_ir;

        //------ reg_A (operand A, with hazard handling) ------//
        if (id_a_is_r1)
            reg_A <= id_operand(id_ir[10:8]);
        else
            reg_A <= id_operand(id_ir[6:4]);
        if (id_flush)
            reg_A <= 16'b0; // last nonblocking assignment wins

        //------ reg_B (operand B, with hazard handling) ------//
        case (id_ir[15:11])
            // 4-bit immediate val3
            `LOAD, `STORE, `SLL, `SRL, `SLA, `SRA :
                reg_B <= {12'b0000_0000_0000, id_ir[3:0]};
            // LDIH : r1 <- r1 + {val2,val3,0000_0000}
            `LDIH :
                reg_B <= {id_ir[7:0], 8'b0000_0000};
            // 8-bit immediate {val2,val3}
            `ADDI, `SUBI, `BZ, `BNZ, `BN, `BNN, `BC, `BNC, `JMPR :
                reg_B <= {8'b0000_0000, id_ir[7:0]};
            // register operand r3; only this case is squashed by a flush
            default :
            begin
                reg_B <= id_operand(id_ir[2:0]);
                if (id_flush)
                    reg_B <= 16'b0;
            end
        endcase

        //------ smdr (store data, with hazard handling) ------//
        if (id_ir[15:11] == `STORE)
        begin
            smdr <= id_operand(id_ir[10:8]);
            if (id_flush)
                smdr <= 16'b0;
        end
        else
            smdr <= smdr;
    end
end
//************* EX *************//
// Latches the EX/MEM pipeline registers, the zf/nf flags and the store
// write-enable dw.
// NOTE(review): `bypassing` is declared further down in this module; some
// tools require a variable to be declared before its first use - confirm the
// target tool accepts this ordering.
always @(posedge clk or negedge reset)
begin
    if (!reset) begin
        mem_ir <= 0;
        zf     <= 0;
        nf     <= 0;
        reg_C  <= 0;
        dw     <= 0;
        smdr1  <= 0;
    end
    else if (state == `exec)
    begin
        //*************** Bypassing ***************//
        // While bypassing is active the EX/MEM registers keep their old
        // values, so they do not toggle.
        if (!bypassing)
        begin
            reg_C  <= ALUo;
            mem_ir <= ex_ir;
            smdr1  <= smdr;
        end

        // CMP updates both flags from the subtraction result. nf is written
        // on every CMP; before, a zero result left nf at the value set by an
        // earlier CMP, so a following BN/BNN could see a stale flag.
        if (ex_ir[15:11] == `CMP)
        begin
            if (ALUo == 16'b0000_0000_0000_0000)
                zf <= 1'b1;
            else
                zf <= 1'b0;
            if (ALUo[15] == 1'b1)
                nf <= 1'b1;
            else
                nf <= 1'b0;
        end
        else begin
            nf <= nf;
            zf <= zf;
        end

        // STORE: request a data-memory write for the next stage.
        if (ex_ir[15:11] == `STORE)
            dw <= 1'b1;
        else
            dw <= 1'b0;
    end
end
// ALU: computes {cf,ALUo} from reg_A and reg_B according to the opcode of the
// instruction in the EX stage (ex_ir[15:11]).
// NOTE(review): this is level-sensitive logic written with nonblocking
// assignments, and the default branch holds cf and ALUo, which infers latches.
// ADDC and SUBC also read cf, which this same block drives, giving a
// combinational feedback path. Left unchanged because branches in the MEM
// stage read the held cf value - confirm before restructuring.
reg signed [15:0] A_reg;// signed copy of reg_A, used by the arithmetic right shift
always @(reg_A)
A_reg <= reg_A;
always @(*)
begin
case(ex_ir[15:11])
`LOAD : {cf,ALUo} <= reg_A + reg_B;
`STORE : {cf,ALUo} <= reg_A + reg_B;
`LDIH : {cf,ALUo} <= reg_A + reg_B;
`ADD : {cf,ALUo} <= reg_A + reg_B;
`ADDI : {cf,ALUo} <= reg_A + reg_B;
`ADDC : {cf,ALUo} <= reg_A + reg_B + cf;
`SUB : {cf,ALUo} <= reg_A - reg_B;
`SUBI : {cf,ALUo} <= reg_A - reg_B;
`SUBC : {cf,ALUo} <= reg_A - reg_B - cf;
`CMP : {cf,ALUo} <= reg_A - reg_B;// CMP: the result is used to set the CF, NF and ZF flags
`AND : {cf,ALUo} <= reg_A & reg_B;
`OR : {cf,ALUo} <= reg_A | reg_B;
`XOR : {cf,ALUo} <= reg_A ^ reg_B;
`SLL : {cf,ALUo} <= reg_A << reg_B;// logical left shift, low bits filled with 0
`SRL : {cf,ALUo} <= reg_A >> reg_B;// logical right shift, high bits filled with 0
`SLA : {cf,ALUo} <= reg_A <<< reg_B;// arithmetic left shift, same as logical left shift
`SRA : {cf,ALUo} <= A_reg >>> reg_B;// arithmetic right shift, high bits filled with the sign bit
`JMPR : {cf,ALUo} <= reg_A + reg_B;
`BZ : {cf,ALUo} <= reg_A + reg_B;
`BNZ : {cf,ALUo} <= reg_A + reg_B;
`BN : {cf,ALUo} <= reg_A + reg_B;
`BNN : {cf,ALUo} <= reg_A + reg_B;
`BC : {cf,ALUo} <= reg_A + reg_B;
`BNC : {cf,ALUo} <= reg_A + reg_B;
// Any other opcode (NOP, HALT, JUMP, undefined): hold the previous values.
default:
begin cf <= cf; ALUo <= ALUo; end
endcase
end
//***************** decide whether to bypass ***************//
// bypassing is set when the instruction in EX is an ALU-type instruction that
// does not use data memory, unless the instruction in MEM is a LOAD.
// NOTE(review): bypassing is a registered signal, so the EX and MEM stages
// consume it one clock after the ex_ir it was derived from - confirm that
// this alignment is intended.
reg bypassing;
always @(posedge clk or negedge reset)
begin
    if (!reset)
        bypassing <= 1'b0;
    else
    begin
        case (ex_ir[15:11])
            `LDIH, `ADD, `ADDI, `ADDC, `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR, `SLL, `SRL, `SLA, `SRA :
            begin
                // A LOAD needs all five stages; writing back early behind it
                // would conflict. Compare the opcode field only: the whole
                // 16-bit instruction word never equals the 5-bit `LOAD code
                // for a real LOAD, so the earlier check could not fire.
                if (mem_ir[15:11] == `LOAD)
                    bypassing <= 1'b0;
                else
                    bypassing <= 1'b1;
            end
            default :
                bypassing <= 1'b0;
        endcase
    end
end
//************* MEM *************//
// Drives the data-memory interface and latches the MEM/WB registers
// wb_ir and reg_C1.
always @(posedge clk or negedge reset)
begin
    if (!reset) begin
        wb_ir     <= 0;
        reg_C1    <= 0;
        d_addr    <= 0; // reset together with the other memory outputs
        d_dataout <= 0;
        d_we      <= 0;
    end
    else if (state == `exec)
    begin
        // Data-memory address, write data and write enable.
        d_addr    <= reg_C[7:0];
        d_dataout <= smdr1;
        d_we      <= dw;

        //*************** Bypassing **************//
        // When bypassing, the instruction in EX goes straight to WB,
        // skipping the EX/MEM register.
        if (bypassing)
            wb_ir <= ex_ir;
        else
            wb_ir <= mem_ir;

        // Value to write back: loaded data has priority, then the bypassed
        // ALU result, then the result held in the EX/MEM register.
        if (mem_ir[15:11] == `LOAD)
            reg_C1 <= d_datain;
        else if (bypassing)
            reg_C1 <= ALUo;
        else
            reg_C1 <= reg_C;
    end
end
//************* WB *************//
// Writes reg_C1 into the destination register r1 (wb_ir[10:8]) for every
// instruction that produces a register result.
integer wb_i; // loop index used only to clear the register file on reset
always @(posedge clk or negedge reset)
begin
    if (!reset) begin
        // Clear all eight registers. The earlier trailing self-assignment
        // gr[wb_ir[10:8]] <= gr[wb_ir[10:8]] overrode the clear for that one
        // register, leaving it at its previous (possibly unknown) value.
        for (wb_i = 0; wb_i < 8; wb_i = wb_i + 1)
            gr[wb_i] <= 16'b0000_0000_0000_0000;
    end
    else if (state == `exec)
    begin
        case (wb_ir[15:11])
            // CMP is not listed: it only updates the flags, and the
            // forwarding logic in ID already treats it as writing no register.
            `LOAD, `LDIH, `ADD, `ADDI, `ADDC, `SUB, `SUBI, `SUBC,
            `AND, `OR, `XOR, `SLL, `SRL, `SLA, `SRA :
                gr[wb_ir[10:8]] <= reg_C1;
            default : ; // no register write
        endcase
    end
end
endmodule
优化前后的XPOWER测试结果比较:
Basic design:
With clock gating:
相关文章推荐
- 五级流水线CPU之低功耗设计(1)——旁路(By Passing)
- cpu五级流水线设计优化之低功耗
- 五级流水线CPU之低功耗设计(2)——门控(CLOCK GATE)
- 五级流水线CPU之低功耗设计 (二) :Clock Gating(门控)
- 数字电路设计之五级流水线设计(CPU)
- 数字电路设计之五级流水线设计(CPU)
- cpu五级流水线基础设计
- 低功耗设计二之Bypassing(旁路)
- 数字电路设计之低功耗设计方法六:旁路(by-passing)
- 五级流水线CPU
- 请问谁那有计算机系统结构的课程设计—设计一个流水线CPU。。
- 简单的MIPS5级流水线CPU设计
- 自己动手写CPU之第七阶段(5)——流水线暂停机制的设计与实现
- 指令集并行流水线CPU设计
- 自己动手写CPU之第七阶段(5)——流水线暂停机制的设计与实现
- Verilog 数字电路设计之带hazard的五级流水线CPU
- CPU五级流水线工程(带Hazard)
- 单周期CPU及流水线CPU设计(1)---logisim部件设计
- 16位5级流水线CPU设计
- JavaCard CPU的设计与FPGA实现