/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem

import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan.ExceptionNO._
import xiangshan._
import xiangshan.backend.fu.PMPRespBundle
import xiangshan.cache._
import xiangshan.cache.mmu.{TlbCmd, TlbReq, TlbRequestIO, TlbResp}
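
// Replay information reported from load_s1 to the load queue (fast path, see
// LoadUnit_S1.replayFast below): whether the ld-ld / st-ld violation checks passed and
// whether the dcache bank access was conflict-free for the load at lq entry ld_idx.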
class LoadToLsqFastIO(implicit p: Parameters) extends XSBundle {
  val valid = Output(Bool())
  val ld_ld_check_ok = Output(Bool())
  val st_ld_check_ok = Output(Bool())
  val cache_bank_no_conflict = Output(Bool())
  val ld_idx = Output(UInt(log2Ceil(LoadQueueSize).W))
}
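
// Replay information reported from load_s2 to the load queue (slow path, see
// LoadUnit_S2.replaySlow below): covers conditions only known after the dtlb / dcache
// response, e.g. tlb miss, dcache requesting a replay, or forward data not yet valid.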
class LoadToLsqSlowIO(implicit p: Parameters) extends XSBundle {
  val valid = Output(Bool())
  val tlb_hited = Output(Bool())
  val st_ld_check_ok = Output(Bool())
  val cache_no_replay = Output(Bool())
  val forward_data_valid = Output(Bool())
  val ld_idx = Output(UInt(log2Ceil(LoadQueueSize).W))
  val data_invalid_sq_idx = Output(UInt(log2Ceil(StoreQueueSize).W))
}

class LoadToLsqIO(implicit p: Parameters) extends XSBundle {
  val loadIn = ValidIO(new LqWriteBundle)
  val loadPaddrIn = ValidIO(new LqPaddrWriteBundle)
  val loadVaddrIn = ValidIO(new LqVaddrWriteBundle)
  val ldout = Flipped(DecoupledIO(new ExuOutput))
  val ldRawData = Input(new LoadDataFromLQBundle)
  val s2_load_data_forwarded = Output(Bool())
  val s3_delayed_load_error = Output(Bool())
  val s2_dcache_require_replay = Output(Bool())
  val s3_replay_from_fetch = Output(Bool()) // update uop.ctrl.replayInst in load queue in s3
  val forward = new PipeLoadForwardQueryIO
  val loadViolationQuery = new LoadViolationQueryIO
  val trigger = Flipped(new LqTriggerIO)

  // for load replay
  val replayFast = new LoadToLsqFastIO
  val replaySlow = new LoadToLsqSlowIO
}

class LoadToLoadIO(implicit p: Parameters) extends XSBundle {
  // load to load fast path is limited to ld (64 bit) used as vaddr src1 only
  val data = UInt(XLEN.W)
  val valid = Bool()
}

class LoadUnitTriggerIO(implicit p: Parameters) extends XSBundle {
  val tdata2 = Input(UInt(64.W))
  val matchType = Input(UInt(2.W))
  val tEnable = Input(Bool()) // timing is calculated before this
  val addrHit = Output(Bool())
  val lastDataHit = Output(Bool())
}

// Load Pipeline Stage 0
// Generate addr, use addr to query DCache and DTLB
class LoadUnit_S0(implicit p: Parameters) extends XSModule with HasDCacheParameters {
  val io = IO(new Bundle() {
    val in = Flipped(Decoupled(new ExuInput))
    val out = Decoupled(new LsPipelineBundle)
    val dtlbReq = DecoupledIO(new TlbReq)
    val dcacheReq = DecoupledIO(new DCacheWordReq)
    val rsIdx = Input(UInt(log2Up(IssQueSize).W))
    val isFirstIssue = Input(Bool())
    val fastpath = Input(new LoadToLoadIO)
    val s0_kill = Input(Bool())
    // wire from lq to load pipeline
    val lsqOut = Flipped(Decoupled(new LsPipelineBundle))
    val s0_sqIdx = Output(new SqPtr)
  })
  require(LoadPipelineWidth == exuParameters.LduCnt)
2022-12-02 22:35:02 +08:00
// there are three sources of load pipeline's input
// * 1. load issued by RS (io.in)
// * 2. load replayed by LSQ (io.lsqOut)
// * 3. load try pointchaising when no issued or replayed load (io.fastpath)
2021-08-20 22:39:07 +08:00
2022-12-02 22:35:02 +08:00
// the priority is
// 1 > 2 > 3
// now in S0, choise a load according to priority
val s0_vaddr = Wire ( UInt ( VAddrBits . W ) )
val s0_mask = Wire ( UInt ( 8. W ) )
val s0_uop = Wire ( new MicroOp )
val s0_isFirstIssue = Wire ( Bool ( ) )
val s0_rsIdx = Wire ( UInt ( log2Up ( IssQueSize ) . W ) )
val s0_sqIdx = Wire ( new SqPtr )
io . s0_sqIdx : = s0_sqIdx
val tryFastpath = WireInit ( false . B )
val s0_valid = Wire ( Bool ( ) )
s0_valid : = io . in . valid || io . lsqOut . valid || tryFastpath
// assign default value
s0_uop : = DontCare
when ( io . in . valid ) {
val imm12 = io . in . bits . uop . ctrl . imm ( 11 , 0 )
s0_vaddr : = io . in . bits . src ( 0 ) + SignExt ( imm12 , VAddrBits )
s0_mask : = genWmask ( s0_vaddr , io . in . bits . uop . ctrl . fuOpType ( 1 , 0 ) )
s0_uop : = io . in . bits . uop
s0_isFirstIssue : = io . isFirstIssue
s0_rsIdx : = io . rsIdx
s0_sqIdx : = io . in . bits . uop . sqIdx
} . elsewhen ( io . lsqOut . valid ) {
s0_vaddr : = io . lsqOut . bits . vaddr
s0_mask : = io . lsqOut . bits . mask
s0_uop : = io . lsqOut . bits . uop
s0_isFirstIssue : = io . lsqOut . bits . isFirstIssue
s0_rsIdx : = io . lsqOut . bits . rsIdx
s0_sqIdx : = io . lsqOut . bits . uop . sqIdx
} . otherwise {
if ( EnableLoadToLoadForward ) {
tryFastpath : = io . fastpath . valid
// When there's no valid instruction from RS and LSQ, we try the load-to-load forwarding.
2022-08-22 19:02:28 +08:00
s0_vaddr : = io . fastpath . data
// Assume the pointer chasing is always ld.
s0_uop . ctrl . fuOpType : = LSUOpType . ld
s0_mask : = genWmask ( 0. U , LSUOpType . ld )
2022-12-02 22:35:02 +08:00
// we dont care s0_isFirstIssue and s0_rsIdx and s0_sqIdx in S0 when trying pointchasing
// because these signals will be updated in S1
s0_isFirstIssue : = DontCare
s0_rsIdx : = DontCare
s0_sqIdx : = DontCare
2022-08-22 19:02:28 +08:00
}
2021-11-30 20:27:16 +08:00
}
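
  // address alignment check: fuOpType(1, 0) encodes the access size
  // (00: byte, 01: half, 10: word, 11: double word)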
  val addrAligned = LookupTree(s0_uop.ctrl.fuOpType(1, 0), List(
    "b00".U   -> true.B,                   //b
    "b01".U   -> (s0_vaddr(0)    === 0.U), //h
    "b10".U   -> (s0_vaddr(1, 0) === 0.U), //w
    "b11".U   -> (s0_vaddr(2, 0) === 0.U)  //d
  ))

  // io.in has highest priority
  io.in.ready := !io.in.valid || (io.out.ready && io.dcacheReq.ready)
  // io.lsqOut can fire only when there is no RS-issued load
  io.lsqOut.ready := (io.out.ready && io.dcacheReq.ready && !io.in.valid)

  val isSoftPrefetch = LSUOpType.isPrefetch(s0_uop.ctrl.fuOpType)
  val isSoftPrefetchRead = s0_uop.ctrl.fuOpType === LSUOpType.prefetch_r
  val isSoftPrefetchWrite = s0_uop.ctrl.fuOpType === LSUOpType.prefetch_w

  // query DTLB
  io.dtlbReq.valid := s0_valid
  io.dtlbReq.bits.vaddr := s0_vaddr
  io.dtlbReq.bits.cmd := TlbCmd.read
  io.dtlbReq.bits.size := LSUOpType.size(s0_uop.ctrl.fuOpType)
  io.dtlbReq.bits.kill := DontCare
  io.dtlbReq.bits.debug.robIdx := s0_uop.robIdx
  io.dtlbReq.bits.debug.pc := s0_uop.cf.pc
  io.dtlbReq.bits.debug.isFirstIssue := s0_isFirstIssue

  // query DCache
  io.dcacheReq.valid := s0_valid
  when (isSoftPrefetchRead) {
    io.dcacheReq.bits.cmd := MemoryOpConstants.M_PFR
  }.elsewhen (isSoftPrefetchWrite) {
    io.dcacheReq.bits.cmd := MemoryOpConstants.M_PFW
  }.otherwise {
    io.dcacheReq.bits.cmd := MemoryOpConstants.M_XRD
  }
  io.dcacheReq.bits.addr := s0_vaddr
  io.dcacheReq.bits.mask := s0_mask
  io.dcacheReq.bits.data := DontCare
  when (isSoftPrefetch) {
    io.dcacheReq.bits.instrtype := SOFT_PREFETCH.U
  }.otherwise {
    io.dcacheReq.bits.instrtype := LOAD_SOURCE.U
  }

  // TODO: update cache meta
  io.dcacheReq.bits.id := DontCare

  io.out.valid := s0_valid && io.dcacheReq.ready && !io.s0_kill

  io.out.bits := DontCare
  io.out.bits.vaddr := s0_vaddr
  io.out.bits.mask := s0_mask
  io.out.bits.uop := s0_uop
  io.out.bits.uop.cf.exceptionVec(loadAddrMisaligned) := !addrAligned
  io.out.bits.rsIdx := s0_rsIdx
  io.out.bits.isFirstIssue := s0_isFirstIssue
  io.out.bits.isSoftPrefetch := isSoftPrefetch
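
  // mark loads that entered the pipeline through the LSQ replay port, so that
  // load_s1 does not report RS feedback for them again (see rsFeedback in LoadUnit_S1)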
  io.out.bits.isLoadReplay := !io.in.valid && io.lsqOut.valid

  XSDebug(io.dcacheReq.fire,
    p"[DCACHE LOAD REQ] pc ${Hexadecimal(s0_uop.cf.pc)}, vaddr ${Hexadecimal(s0_vaddr)}\n"
  )
  XSPerfAccumulate("in_valid", io.in.valid)
  XSPerfAccumulate("in_fire", io.in.fire)
  XSPerfAccumulate("in_fire_first_issue", io.in.valid && io.isFirstIssue)
  XSPerfAccumulate("stall_out", io.out.valid && !io.out.ready && io.dcacheReq.ready)
  XSPerfAccumulate("stall_dcache", io.out.valid && io.out.ready && !io.dcacheReq.ready)
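  // addr_spec_*: count how often the final s0 vaddr stays in the same 4 KB page as the
  // base register src(0), i.e. whether page-level address speculation would have been correct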
  XSPerfAccumulate("addr_spec_success", io.out.fire && s0_vaddr(VAddrBits-1, 12) === io.in.bits.src(0)(VAddrBits-1, 12))
  XSPerfAccumulate("addr_spec_failed", io.out.fire && s0_vaddr(VAddrBits-1, 12) =/= io.in.bits.src(0)(VAddrBits-1, 12))
  XSPerfAccumulate("addr_spec_success_once", io.out.fire && s0_vaddr(VAddrBits-1, 12) === io.in.bits.src(0)(VAddrBits-1, 12) && io.isFirstIssue)
  XSPerfAccumulate("addr_spec_failed_once", io.out.fire && s0_vaddr(VAddrBits-1, 12) =/= io.in.bits.src(0)(VAddrBits-1, 12) && io.isFirstIssue)
}

// Load Pipeline Stage 1
// TLB resp (send paddr to dcache)
class LoadUnit_S1(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper {
  val io = IO(new Bundle() {
    val in = Flipped(Decoupled(new LsPipelineBundle))
    val s1_kill = Input(Bool())
    val out = Decoupled(new LsPipelineBundle)
    val dtlbResp = Flipped(DecoupledIO(new TlbResp(2)))
    val lsuPAddr = Output(UInt(PAddrBits.W))
    val dcachePAddr = Output(UInt(PAddrBits.W))
    val dcacheKill = Output(Bool())
    val dcacheBankConflict = Input(Bool())
    val fullForwardFast = Output(Bool())
    val sbuffer = new LoadForwardQueryIO
    val lsq = new PipeLoadForwardQueryIO
    val loadViolationQueryReq = Decoupled(new LoadViolationQueryReq)
    val reExecuteQuery = Flipped(Vec(StorePipelineWidth, Valid(new LoadReExecuteQueryIO)))
    val rsFeedback = ValidIO(new RSFeedback)
    val replayFast = new LoadToLsqFastIO
    val csrCtrl = Flipped(new CustomCSRCtrlIO)
    val needLdVioCheckRedo = Output(Bool())
    val needReExecute = Output(Bool())
  })

  val s1_uop = io.in.bits.uop
  val s1_paddr_dup_lsu = io.dtlbResp.bits.paddr(0)
  val s1_paddr_dup_dcache = io.dtlbResp.bits.paddr(1)
  // af & pf exception were modified below.
  val s1_exception = ExceptionNO.selectByFu(io.out.bits.uop.cf.exceptionVec, lduCfg).asUInt.orR
  val s1_tlb_miss = io.dtlbResp.bits.miss
  val s1_mask = io.in.bits.mask
  val s1_bank_conflict = io.dcacheBankConflict

  io.out.bits := io.in.bits // forwardXX field will be updated in s1

  io.dtlbResp.ready := true.B
  io.lsuPAddr := s1_paddr_dup_lsu
  io.dcachePAddr := s1_paddr_dup_dcache
  //io.dcacheKill := s1_tlb_miss || s1_exception || s1_mmio
  io.dcacheKill := s1_tlb_miss || s1_exception || io.s1_kill
  // load forward query datapath
  io.sbuffer.valid := io.in.valid && !(s1_exception || s1_tlb_miss || io.s1_kill)
  io.sbuffer.vaddr := io.in.bits.vaddr
  io.sbuffer.paddr := s1_paddr_dup_lsu
  io.sbuffer.uop := s1_uop
  io.sbuffer.sqIdx := s1_uop.sqIdx
  io.sbuffer.mask := s1_mask
  io.sbuffer.pc := s1_uop.cf.pc // FIXME: remove it

  io.lsq.valid := io.in.valid && !(s1_exception || s1_tlb_miss || io.s1_kill)
  io.lsq.vaddr := io.in.bits.vaddr
  io.lsq.paddr := s1_paddr_dup_lsu
  io.lsq.uop := s1_uop
  io.lsq.sqIdx := s1_uop.sqIdx
  io.lsq.sqIdxMask := DontCare // will be overwritten by sqIdxMask pre-generated in s0
  io.lsq.mask := s1_mask
  io.lsq.pc := s1_uop.cf.pc // FIXME: remove it

  // ld-ld violation query
  io.loadViolationQueryReq.valid := io.in.valid && !(s1_exception || s1_tlb_miss || io.s1_kill)
  io.loadViolationQueryReq.bits.paddr := s1_paddr_dup_lsu
  io.loadViolationQueryReq.bits.uop := s1_uop

  // st-ld violation query
  val needReExecuteVec = Wire(Vec(StorePipelineWidth, Bool()))
  val needReExecute = Wire(Bool())

  for (w <- 0 until StorePipelineWidth) {
    //  needReExecute valid when
    //  1. ReExecute query request valid.
    //  2. Load instruction is younger than requestors (store instructions).
    //  3. Physical address match.
    //  4. Accessed data overlaps (byte masks intersect).
    needReExecuteVec(w) := io.reExecuteQuery(w).valid &&
                          isAfter(io.in.bits.uop.robIdx, io.reExecuteQuery(w).bits.robIdx) &&
                          !s1_tlb_miss &&
                          (s1_paddr_dup_lsu(PAddrBits-1, 3) === io.reExecuteQuery(w).bits.paddr(PAddrBits-1, 3)) &&
                          (s1_mask & io.reExecuteQuery(w).bits.mask).orR
  }
  needReExecute := needReExecuteVec.asUInt.orR
  io.needReExecute := needReExecute

  // Generate forwardMaskFast to wake up insts earlier
  val forwardMaskFast = io.lsq.forwardMaskFast.asUInt | io.sbuffer.forwardMaskFast.asUInt
  io.fullForwardFast := ((~forwardMaskFast).asUInt & s1_mask) === 0.U

  // Generate feedback signal caused by:
  // * dcache bank conflict
  // * need redo ld-ld violation check
  val needLdVioCheckRedo = io.loadViolationQueryReq.valid &&
    !io.loadViolationQueryReq.ready &&
    RegNext(io.csrCtrl.ldld_vio_check_enable)
  io.needLdVioCheckRedo := needLdVioCheckRedo

  // io.rsFeedback.valid := io.in.valid && (s1_bank_conflict || needLdVioCheckRedo) && !io.s1_kill
  io.rsFeedback.valid := Mux(io.in.bits.isLoadReplay, false.B, io.in.valid && !io.s1_kill)
  io.rsFeedback.bits.hit := true.B // we have found s1_bank_conflict / need to redo ld-ld violation check
  io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx
  io.rsFeedback.bits.flushState := io.in.bits.ptwBack
  io.rsFeedback.bits.sourceType := Mux(s1_bank_conflict, RSFeedbackType.bankConflict, RSFeedbackType.ldVioCheckRedo)
  io.rsFeedback.bits.dataInvalidSqIdx := DontCare
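
  // report the s1 replay conditions (ld-ld / st-ld check redo, dcache bank conflict)
  // to the load queue, so it can replay the load at lq entry ld_idx if needed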
  io.replayFast.valid := io.in.valid && !io.s1_kill
  io.replayFast.ld_ld_check_ok := !needLdVioCheckRedo
  io.replayFast.st_ld_check_ok := !needReExecute
  io.replayFast.cache_bank_no_conflict := !s1_bank_conflict
  io.replayFast.ld_idx := io.in.bits.uop.lqIdx.value

  // if replay is detected in load_s1,
  // load inst will be canceled immediately
  io.out.valid := io.in.valid && (!needLdVioCheckRedo && !s1_bank_conflict && !needReExecute) && !io.s1_kill
  io.out.bits.paddr := s1_paddr_dup_lsu
  io.out.bits.tlbMiss := s1_tlb_miss

  // current ori test will cause the case of ldest == 0, below will be modified in the future.
  // af & pf exception were modified
  io.out.bits.uop.cf.exceptionVec(loadPageFault) := io.dtlbResp.bits.excp(0).pf.ld
  io.out.bits.uop.cf.exceptionVec(loadAccessFault) := io.dtlbResp.bits.excp(0).af.ld

  io.out.bits.ptwBack := io.dtlbResp.bits.ptwBack
  io.out.bits.rsIdx := io.in.bits.rsIdx
  io.out.bits.isSoftPrefetch := io.in.bits.isSoftPrefetch

  io.in.ready := !io.in.valid || io.out.ready

  XSPerfAccumulate("in_valid", io.in.valid)
  XSPerfAccumulate("in_fire", io.in.fire)
  XSPerfAccumulate("in_fire_first_issue", io.in.fire && io.in.bits.isFirstIssue)
  XSPerfAccumulate("tlb_miss", io.in.fire && s1_tlb_miss)
  XSPerfAccumulate("tlb_miss_first_issue", io.in.fire && s1_tlb_miss && io.in.bits.isFirstIssue)
  XSPerfAccumulate("stall_out", io.out.valid && !io.out.ready)
}

// Load Pipeline Stage 2
// DCache resp
class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper with HasCircularQueuePtrHelper {
  val io = IO(new Bundle() {
    val in = Flipped(Decoupled(new LsPipelineBundle))
    val out = Decoupled(new LsPipelineBundle)
    val rsFeedback = ValidIO(new RSFeedback)
    val replaySlow = new LoadToLsqSlowIO
    val dcacheResp = Flipped(DecoupledIO(new BankedDCacheWordResp))
    val pmpResp = Flipped(new PMPRespBundle())
    val lsq = new LoadForwardQueryIO
    val dataInvalidSqIdx = Input(UInt())
    val sbuffer = new LoadForwardQueryIO
    val dataForwarded = Output(Bool())
    val s2_dcache_require_replay = Output(Bool())
    val fullForward = Output(Bool())
    val dcache_kill = Output(Bool())
    val s3_delayed_load_error = Output(Bool())
    val loadViolationQueryResp = Flipped(Valid(new LoadViolationQueryResp))
    val csrCtrl = Flipped(new CustomCSRCtrlIO)
    val sentFastUop = Input(Bool())
    val static_pm = Input(Valid(Bool())) // valid for static, bits for mmio
    val s2_can_replay_from_fetch = Output(Bool()) // dirty code
    val loadDataFromDcache = Output(new LoadDataFromDcacheBundle)
    val reExecuteQuery = Flipped(Vec(StorePipelineWidth, Valid(new LoadReExecuteQueryIO)))
    val needReExecute = Output(Bool())
    // val write_lq_safe = Output(Bool()) // used by duplicate wen signals
  })
2021-12-20 15:32:19 +08:00
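  // When the physical memory attributes of this access are statically known
  // (io.static_pm.valid), the dynamically checked PMP/PMA response is overridden:
  // the access-fault bits are cleared and only the mmio attribute
  // (io.static_pm.bits) is kept.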
  val pmp = WireInit(io.pmpResp)
  when (io.static_pm.valid) {
    pmp.ld := false.B
    pmp.st := false.B
    pmp.instr := false.B
    pmp.mmio := io.static_pm.bits
2021-10-25 20:16:15 +08:00
}
2021-10-11 21:56:10 +08:00
2021-12-20 15:32:19 +08:00
  val s2_is_prefetch = io.in.bits.isSoftPrefetch
  // exceptions that may cause the load addr to be invalid / illegal
  //
  // if such an exception happens, that inst and its exception info
2022-07-18 09:41:17 +08:00
  // will be force-writebacked to the rob
2021-12-20 15:32:19 +08:00
  val s2_exception_vec = WireInit(io.in.bits.uop.cf.exceptionVec)
  s2_exception_vec(loadAccessFault) := io.in.bits.uop.cf.exceptionVec(loadAccessFault) || pmp.ld
  // soft prefetch will not trigger any exception (but ecc error interrupt may be triggered)
  when (s2_is_prefetch) {
    s2_exception_vec := 0.U.asTypeOf(s2_exception_vec.cloneType)
2022-07-18 09:41:17 +08:00
}
2022-12-02 22:35:02 +08:00
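  // s2_exception: OR-reduction of the exception causes a load can raise
  // (selected by lduCfg). It is masked on a TLB miss, since a missed
  // translation will be replayed and its stale exception bits must not fire.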
  val s2_exception = ExceptionNO.selectByFu(s2_exception_vec, lduCfg).asUInt.orR && !io.in.bits.tlbMiss // ????????
2021-12-20 15:32:19 +08:00
2022-06-28 13:47:21 +08:00
// writeback access fault caused by ecc error / bus error
2021-12-20 15:32:19 +08:00
//
2022-06-28 13:47:21 +08:00
// * ecc data error is slow to generate, so we will not use it until load stage 3
2022-08-22 19:02:28 +08:00
  // * in load stage 3, an extra signal io.s3_delayed_load_error will be used to report it
2022-06-28 13:47:21 +08:00
2021-12-20 15:32:19 +08:00
  // for now, a cache ecc error will raise an access fault
  // at the same time, error info (including the error paddr) will be written to
  // a customized CSR "CACHE_ERROR"
2022-06-28 13:47:21 +08:00
  if (EnableAccurateLoadError) {
2022-10-13 15:57:25 +08:00
    io.s3_delayed_load_error := io.dcacheResp.bits.error_delayed &&
2022-08-22 19:02:28 +08:00
      io.csrCtrl.cache_error_enable &&
2022-06-28 13:47:21 +08:00
      RegNext(io.out.valid)
  } else {
2022-10-13 15:57:25 +08:00
    io.s3_delayed_load_error := false.B
2022-06-28 13:47:21 +08:00
  }
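  // Net effect: s3_delayed_load_error fires one cycle after a load writeback
  // whose dcache response reported a delayed ECC error, and only when cache
  // error reporting is enabled via CSR.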
2021-12-20 15:32:19 +08:00
  val actually_mmio = pmp.mmio
2020-10-17 21:05:46 +08:00
  val s2_uop = io.in.bits.uop
  val s2_mask = io.in.bits.mask
  val s2_paddr = io.in.bits.paddr
2021-01-08 20:49:30 +08:00
  val s2_tlb_miss = io.in.bits.tlbMiss
2021-12-20 15:32:19 +08:00
  val s2_mmio = !s2_is_prefetch && actually_mmio && !s2_exception
2020-10-25 13:24:10 +08:00
  val s2_cache_miss = io.dcacheResp.bits.miss
2021-01-08 17:17:13 +08:00
  val s2_cache_replay = io.dcacheResp.bits.replay
2022-02-13 08:29:29 +08:00
  val s2_cache_tag_error = io.dcacheResp.bits.tag_error
2021-08-03 14:28:43 +08:00
  val s2_forward_fail = io.lsq.matchInvalid || io.sbuffer.matchInvalid
2021-12-22 16:54:40 +08:00
  val s2_ldld_violation = io.loadViolationQueryResp.valid &&
    io.loadViolationQueryResp.bits.have_violation &&
    RegNext(io.csrCtrl.ldld_vio_check_enable)
2022-11-18 14:52:30 +08:00
  val s2_data_invalid = io.lsq.dataInvalid && !s2_ldld_violation && !s2_exception
2021-12-22 16:54:40 +08:00
  io.dcache_kill := pmp.ld || pmp.mmio // move pmp resp kill to outside
2020-10-25 13:24:10 +08:00
  io.dcacheResp.ready := true.B
2021-10-27 14:45:39 +08:00
  val dcacheShouldResp = !(s2_tlb_miss || s2_exception || s2_mmio || s2_is_prefetch)
  assert(!(io.in.valid && (dcacheShouldResp && !io.dcacheResp.valid)), "DCache response got lost")
2020-08-06 16:58:13 +08:00
2021-02-01 23:59:58 +08:00
  // merge forward result
  // lsq has higher priority than sbuffer
  val forwardMask = Wire(Vec(8, Bool()))
  val forwardData = Wire(Vec(8, UInt(8.W)))
2021-01-08 20:49:30 +08:00
2022-08-22 19:02:28 +08:00
  val fullForward = ((~forwardMask.asUInt).asUInt & s2_mask) === 0.U && !io.lsq.dataInvalid
2021-02-01 23:59:58 +08:00
  io.lsq := DontCare
  io.sbuffer := DontCare
2021-10-23 13:38:45 +08:00
  io.fullForward := fullForward
2021-02-01 23:59:58 +08:00
  // generate XLEN/8 Muxs
  for (i <- 0 until XLEN / 8) {
    forwardMask(i) := io.lsq.forwardMask(i) || io.sbuffer.forwardMask(i)
    forwardData(i) := Mux(io.lsq.forwardMask(i), io.lsq.forwardData(i), io.sbuffer.forwardData(i))
  }
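  // Illustrative example (hypothetical values): if the lsq forwards byte 2 and
  // the sbuffer forwards bytes 2 and 3, then forwardMask = "b00001100".U,
  // byte 2 is taken from the lsq (higher priority) and byte 3 from the sbuffer.
  // fullForward is set only when every byte requested by s2_mask is covered.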
2020-08-06 16:58:13 +08:00
2022-08-22 19:02:28 +08:00
  XSDebug(io.out.fire, "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n",
2020-12-02 18:16:42 +08:00
    s2_uop.cf.pc,
    io.lsq.forwardData.asUInt, io.lsq.forwardMask.asUInt,
    io.in.bits.forwardData.asUInt, io.in.bits.forwardMask.asUInt
  )
2020-08-06 16:58:13 +08:00
// data merge
2022-09-22 08:56:44 +08:00
// val rdataVec = VecInit((0 until XLEN / 8).map(j =>
// Mux(forwardMask(j), forwardData(j), io.dcacheResp.bits.data(8*(j+1)-1, 8*j))
// )) // s2_rdataVec will be write to load queue
// val rdata = rdataVec.asUInt
// val rdataSel = LookupTree(s2_paddr(2, 0), List(
// "b000".U -> rdata(63, 0),
// "b001".U -> rdata(63, 8),
// "b010".U -> rdata(63, 16),
// "b011".U -> rdata(63, 24),
// "b100".U -> rdata(63, 32),
// "b101".U -> rdata(63, 40),
// "b110".U -> rdata(63, 48),
// "b111".U -> rdata(63, 56)
// ))
// val rdataPartialLoad = rdataHelper(s2_uop, rdataSel) // s2_rdataPartialLoad is not used
2020-08-06 16:58:13 +08:00
2022-12-08 22:05:29 +08:00
  io.out.valid := io.in.valid && !s2_tlb_miss && !s2_data_invalid && !io.needReExecute
2022-12-02 22:35:02 +08:00
  // write_lq_safe is needed by dup logic
  // io.write_lq_safe := !s2_tlb_miss && !s2_data_invalid
2020-11-18 20:47:14 +08:00
  // Inst will be canceled in store queue / lsq,
2020-10-26 14:36:09 +08:00
  // so we do not need to care about flush in load / store unit's out.valid
2020-10-17 21:05:46 +08:00
  io.out.bits := io.in.bits
2022-08-24 13:51:19 +08:00
  // io.out.bits.data := rdataPartialLoad
  io.out.bits.data := 0.U // data will be generated in load_s3
2021-09-28 09:23:31 +08:00
  // when exception occurs, set it to not miss and let it write back to rob (via int port)
2021-08-20 01:27:12 +08:00
  if (EnableFastForward) {
2022-07-18 09:41:17 +08:00
    io.out.bits.miss := s2_cache_miss &&
      !s2_exception &&
2021-10-27 14:45:39 +08:00
      !fullForward &&
      !s2_is_prefetch
2021-08-20 01:27:12 +08:00
  } else {
2021-10-27 14:45:39 +08:00
    io.out.bits.miss := s2_cache_miss &&
      !s2_exception &&
      !s2_is_prefetch
2021-08-20 01:27:12 +08:00
}
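  // Note: with EnableFastForward, a load whose data is entirely forwarded
  // (fullForward) is not reported as a dcache miss here; without it, the miss
  // is kept and io.dataForwarded (below) tells the load queue the data is ready.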
2021-01-15 22:24:39 +08:00
  io.out.bits.uop.ctrl.fpWen := io.in.bits.uop.ctrl.fpWen && !s2_exception
2022-08-24 13:51:19 +08:00
2022-09-22 08:56:44 +08:00
  io.loadDataFromDcache.bankedDcacheData := io.dcacheResp.bits.bank_data
  io.loadDataFromDcache.bank_oh := io.dcacheResp.bits.bank_oh
  // io.loadDataFromDcache.dcacheData := io.dcacheResp.bits.data
2022-08-24 13:51:19 +08:00
  io.loadDataFromDcache.forwardMask := forwardMask
  io.loadDataFromDcache.forwardData := forwardData
  io.loadDataFromDcache.uop := io.out.bits.uop
  io.loadDataFromDcache.addrOffset := s2_paddr(2, 0)
2022-11-18 14:52:30 +08:00
  io.s2_can_replay_from_fetch := !s2_mmio && !s2_is_prefetch && !s2_tlb_miss
2021-10-22 15:29:07 +08:00
  // if forwarding fails, replay this inst from fetch
2022-11-18 14:52:30 +08:00
  val debug_forwardFailReplay = s2_forward_fail && !s2_mmio && !s2_is_prefetch && !s2_tlb_miss
2021-10-22 15:29:07 +08:00
  // if a ld-ld violation is detected, replay this inst from fetch
2022-11-18 14:52:30 +08:00
  val debug_ldldVioReplay = s2_ldld_violation && !s2_mmio && !s2_is_prefetch && !s2_tlb_miss
  // io.out.bits.uop.ctrl.replayInst := false.B
2021-01-11 22:00:04 +08:00
  io.out.bits.mmio := s2_mmio
2021-12-09 20:15:29 +08:00
  io.out.bits.uop.ctrl.flushPipe := s2_mmio && io.sentFastUop
2022-06-28 13:47:21 +08:00
  io.out.bits.uop.cf.exceptionVec := s2_exception_vec // cache error not included
2021-09-28 09:23:31 +08:00
2021-08-20 01:27:12 +08:00
// For timing reasons, sometimes we can not let
2021-01-25 19:46:19 +08:00
// io.out.bits.miss := s2_cache_miss && !s2_exception && !fullForward
2022-02-13 08:29:29 +08:00
  // We use io.dataForwarded instead. It means:
  // 1. forward logic has prepared all the data needed,
  //    and the dcache query is no longer needed.
2022-07-18 09:41:17 +08:00
  // 2. ... or a data cache tag error is detected; this kind of inst
2022-02-13 08:29:29 +08:00
  //    will not update the miss queue. That is to say, if it misses,
  //    that inst may not be refilled.
2021-01-25 19:46:19 +08:00
  // Such an inst will be writebacked from the load queue.
2022-11-18 14:52:30 +08:00
  io.dataForwarded := s2_cache_miss && !s2_exception &&
2022-02-16 10:25:53 +08:00
    (fullForward || io.csrCtrl.cache_error_enable && s2_cache_tag_error)
2021-02-01 23:59:58 +08:00
  // io.out.bits.forwardX will be sent to lq
2021-03-05 20:23:11 +08:00
  io.out.bits.forwardMask := forwardMask
2022-08-31 12:18:59 +08:00
  // data from dcache is not included in io.out.bits.forwardData
  io.out.bits.forwardData := forwardData
2021-01-25 19:46:19 +08:00
2020-10-17 21:05:46 +08:00
  io.in.ready := io.out.ready || !io.in.valid
2022-12-08 22:05:29 +08:00
  // st-ld violation query
  val needReExecuteVec = Wire(Vec(StorePipelineWidth, Bool()))
  val needReExecute = Wire(Bool())
  for (i <- 0 until StorePipelineWidth) {
    // needReExecute(i) is valid when
    // 1. the fast recovery query request is valid,
    // 2. the load instruction is younger than the requestor (store instruction),
    // 3. the physical addresses fall into the same 8-byte word, and
    // 4. the accessed bytes overlap (masks intersect).
    needReExecuteVec(i) := io.reExecuteQuery(i).valid &&
      isAfter(io.in.bits.uop.robIdx, io.reExecuteQuery(i).bits.robIdx) &&
      !s2_tlb_miss &&
      (s2_paddr(PAddrBits - 1, 3) === io.reExecuteQuery(i).bits.paddr(PAddrBits - 1, 3)) &&
      (s2_mask & io.reExecuteQuery(i).bits.mask).orR
  }
  needReExecute := needReExecuteVec.asUInt.orR
  io.needReExecute := needReExecute
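  // Illustrative example (hypothetical values): a store reports robIdx 12,
  // paddr 0x80001008, mask "b00001111" while this load has robIdx 20 (younger)
  // and reads paddr 0x8000100c with mask "b11110000". The two accesses hit the
  // same 8-byte word but touch disjoint bytes, so the mask intersection is 0
  // and needReExecute stays low; with overlapping bytes it would be raised.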
2021-08-20 18:17:28 +08:00
// feedback tlb result to RS
2022-12-02 22:35:02 +08:00
  io.rsFeedback.valid := false.B
2021-12-22 16:54:40 +08:00
  val s2_need_replay_from_rs = Wire(Bool())
2021-12-20 15:32:19 +08:00
  if (EnableFastForward) {
2021-12-22 16:54:40 +08:00
    s2_need_replay_from_rs :=
2022-12-08 22:05:29 +08:00
      needReExecute ||
2021-12-22 16:54:40 +08:00
      s2_tlb_miss || // replay if dtlb miss
2022-11-18 14:52:30 +08:00
      s2_cache_replay && !s2_is_prefetch && !s2_mmio && !s2_exception && !fullForward || // replay if dcache miss queue full / busy
      s2_data_invalid && !s2_is_prefetch // replay if store to load forward data is not ready
2021-12-20 15:32:19 +08:00
} else {
2022-07-18 09:41:17 +08:00
// Note that if all parts of data are available in sq / sbuffer, replay required by dcache will not be scheduled
    s2_need_replay_from_rs :=
2022-12-08 22:05:29 +08:00
      needReExecute ||
2021-12-22 16:54:40 +08:00
      s2_tlb_miss || // replay if dtlb miss
2022-11-18 14:52:30 +08:00
      s2_cache_replay && !s2_is_prefetch && !s2_mmio && !s2_exception && !io.dataForwarded || // replay if dcache miss queue full / busy
      s2_data_invalid && !s2_is_prefetch // replay if store to load forward data is not ready
2021-10-10 11:51:53 +08:00
}
2021-12-22 16:54:40 +08:00
  io.rsFeedback.bits.hit := !s2_need_replay_from_rs
2021-08-20 18:17:28 +08:00
  io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx
  io.rsFeedback.bits.flushState := io.in.bits.ptwBack
2022-04-02 19:08:46 +08:00
// feedback source priority: tlbMiss > dataInvalid > mshrFull
// general case priority: tlbMiss > exception (include forward_fail / ldld_violation) > mmio > dataInvalid > mshrFull > normal miss / hit
2021-08-20 18:17:28 +08:00
  io.rsFeedback.bits.sourceType := Mux(s2_tlb_miss, RSFeedbackType.tlbMiss,
2022-04-02 19:08:46 +08:00
    Mux(s2_data_invalid,
      RSFeedbackType.dataInvalid,
      RSFeedbackType.mshrFull
2021-08-20 18:17:28 +08:00
    )
  )
2021-10-12 19:51:51 +08:00
  io.rsFeedback.bits.dataInvalidSqIdx.value := io.dataInvalidSqIdx
  io.rsFeedback.bits.dataInvalidSqIdx.flag := DontCare
2021-08-20 18:17:28 +08:00
2022-12-02 22:35:02 +08:00
  io.replaySlow.valid := io.in.valid
  io.replaySlow.tlb_hited := !s2_tlb_miss
2022-12-08 22:05:29 +08:00
  io.replaySlow.st_ld_check_ok := !needReExecute
2022-12-02 22:35:02 +08:00
  if (EnableFastForward) {
    io.replaySlow.cache_no_replay := !s2_cache_replay || s2_is_prefetch || s2_mmio || s2_exception || fullForward
  } else {
    io.replaySlow.cache_no_replay := !s2_cache_replay || s2_is_prefetch || s2_mmio || s2_exception || io.dataForwarded
  }
  io.replaySlow.forward_data_valid := !s2_data_invalid || s2_is_prefetch
  io.replaySlow.ld_idx := io.in.bits.uop.lqIdx.value
  io.replaySlow.data_invalid_sq_idx := io.dataInvalidSqIdx
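  // The load queue uses these flags to decide whether this load must be
  // replayed from the lq (slow path): a false tlb_hited, st_ld_check_ok,
  // cache_no_replay or forward_data_valid indicates a replay cause
  // (see the "replay_*_lq" perf counters below).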
2021-08-20 18:17:28 +08:00
// s2_cache_replay is quite slow to generate, send it separately to LQ
2021-11-29 11:34:37 +08:00
  if (EnableFastForward) {
2022-11-18 14:52:30 +08:00
    io.s2_dcache_require_replay := s2_cache_replay && !fullForward
2021-11-29 11:34:37 +08:00
  } else {
2022-11-18 14:52:30 +08:00
    io.s2_dcache_require_replay := s2_cache_replay &&
2022-12-02 22:35:02 +08:00
      s2_need_replay_from_rs &&
2021-12-22 16:54:40 +08:00
      !io.dataForwarded &&
      !s2_is_prefetch &&
      io.out.bits.miss
2021-11-29 11:34:37 +08:00
  }
2021-08-20 18:17:28 +08:00
2021-09-22 15:48:08 +08:00
  XSPerfAccumulate("in_valid", io.in.valid)
  XSPerfAccumulate("in_fire", io.in.fire)
  XSPerfAccumulate("in_fire_first_issue", io.in.fire && io.in.bits.isFirstIssue)
  XSPerfAccumulate("dcache_miss", io.in.fire && s2_cache_miss)
  XSPerfAccumulate("dcache_miss_first_issue", io.in.fire && s2_cache_miss && io.in.bits.isFirstIssue)
2021-03-25 21:08:52 +08:00
  XSPerfAccumulate("full_forward", io.in.valid && fullForward)
  XSPerfAccumulate("dcache_miss_full_forward", io.in.valid && s2_cache_miss && fullForward)
2021-04-30 10:40:51 +08:00
  XSPerfAccumulate("replay", io.rsFeedback.valid && !io.rsFeedback.bits.hit)
  XSPerfAccumulate("replay_tlb_miss", io.rsFeedback.valid && !io.rsFeedback.bits.hit && s2_tlb_miss)
  XSPerfAccumulate("replay_cache", io.rsFeedback.valid && !io.rsFeedback.bits.hit && !s2_tlb_miss && s2_cache_replay)
2021-03-25 21:08:52 +08:00
  XSPerfAccumulate("stall_out", io.out.valid && !io.out.ready)
2022-11-18 14:52:30 +08:00
  XSPerfAccumulate("replay_from_fetch_forward", io.out.valid && debug_forwardFailReplay)
  XSPerfAccumulate("replay_from_fetch_load_vio", io.out.valid && debug_ldldVioReplay)
2022-12-02 22:35:02 +08:00
  XSPerfAccumulate("replay_lq", io.replaySlow.valid && (!io.replaySlow.tlb_hited || !io.replaySlow.cache_no_replay || !io.replaySlow.forward_data_valid))
  XSPerfAccumulate("replay_tlb_miss_lq", io.replaySlow.valid && !io.replaySlow.tlb_hited)
2022-12-08 22:05:29 +08:00
  XSPerfAccumulate("replay_sl_vio", io.replaySlow.valid && io.replaySlow.tlb_hited && !io.replaySlow.st_ld_check_ok)
  XSPerfAccumulate("replay_cache_lq", io.replaySlow.valid && io.replaySlow.tlb_hited && io.replaySlow.st_ld_check_ok && !io.replaySlow.cache_no_replay)
2020-10-17 21:05:46 +08:00
}
2020-08-06 16:58:13 +08:00
2022-08-22 19:02:28 +08:00
class LoadUnit(implicit p: Parameters) extends XSModule
2022-04-02 19:08:46 +08:00
with HasLoadHelper
with HasPerfEvents
with HasDCacheParameters
{
2020-10-17 21:05:46 +08:00
  val io = IO(new Bundle() {
    val ldin = Flipped(Decoupled(new ExuInput))
    val ldout = Decoupled(new ExuOutput)
    val redirect = Flipped(ValidIO(new Redirect))
2021-10-11 16:01:26 +08:00
    val feedbackSlow = ValidIO(new RSFeedback)
    val feedbackFast = ValidIO(new RSFeedback)
2021-01-30 17:31:42 +08:00
    val rsIdx = Input(UInt(log2Up(IssQueSize).W))
2021-03-13 08:49:36 +08:00
    val isFirstIssue = Input(Bool())
2020-10-25 13:24:10 +08:00
val dcache = new DCacheLoadIO
2020-10-17 21:05:46 +08:00
val sbuffer = new LoadForwardQueryIO
2020-11-18 20:47:14 +08:00
val lsq = new LoadToLsqIO
2022-04-02 19:08:46 +08:00
    val refill = Flipped(ValidIO(new Refill))
2022-07-12 22:22:01 +08:00
    val fastUop = ValidIO(new MicroOp) // early wakeup signal generated in load_s1, sent to RS in load_s2
2021-12-01 18:43:36 +08:00
    val trigger = Vec(3, new LoadUnitTriggerIO)
2021-09-02 22:53:18 +08:00
2022-09-30 14:13:58 +08:00
    val tlb = new TlbRequestIO(2)
2021-10-25 20:16:15 +08:00
    val pmp = Flipped(new PMPRespBundle()) // arrives in the same cycle as the tlb resp now
2021-10-11 21:56:10 +08:00
2021-08-20 22:39:07 +08:00
    val fastpathOut = Output(new LoadToLoadIO)
2022-08-22 19:02:28 +08:00
    val fastpathIn = Input(new LoadToLoadIO)
    val loadFastMatch = Input(Bool())
2022-09-01 15:21:55 +08:00
    val loadFastImm = Input(UInt(12.W))
2021-10-22 15:29:07 +08:00
2022-10-13 15:57:25 +08:00
    val s3_delayed_load_error = Output(Bool()) // load ecc error
    // Note that io.s3_delayed_load_error and io.lsq.s3_delayed_load_error are different
2022-06-28 13:47:21 +08:00
2021-10-22 15:29:07 +08:00
    val csrCtrl = Flipped(new CustomCSRCtrlIO)
2022-12-08 22:05:29 +08:00
    val reExecuteQuery = Flipped(Vec(StorePipelineWidth, Valid(new LoadReExecuteQueryIO))) // load replay
2022-12-02 22:35:02 +08:00
    val lsqOut = Flipped(Decoupled(new LsPipelineBundle))
2020-10-17 21:05:46 +08:00
  })
  val load_s0 = Module(new LoadUnit_S0)
  val load_s1 = Module(new LoadUnit_S1)
  val load_s2 = Module(new LoadUnit_S2)
2022-12-02 22:35:02 +08:00
  load_s0.io.lsqOut <> io.lsqOut
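  // io.lsqOut carries loads replayed from the load queue; they re-enter the
  // pipeline at s0 and compete with newly issued loads from io.ldin.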
2022-11-18 14:52:30 +08:00
// load s0
2020-10-17 21:05:46 +08:00
  load_s0.io.in <> io.ldin
2021-09-02 22:53:18 +08:00
  load_s0.io.dtlbReq <> io.tlb.req
2020-10-25 13:24:10 +08:00
  load_s0.io.dcacheReq <> io.dcache.req
2021-01-30 17:31:42 +08:00
  load_s0.io.rsIdx := io.rsIdx
2021-03-13 08:49:36 +08:00
  load_s0.io.isFirstIssue := io.isFirstIssue
2022-08-22 19:02:28 +08:00
  load_s0.io.s0_kill := false.B
2022-12-02 22:35:02 +08:00
  // we try pointer chasing when there is (1) no RS-issued load and (2) no LSQ replayed load
  val s0_tryPointerChasing = !io.ldin.valid && !io.lsqOut.valid && io.fastpathIn.valid
2022-09-01 15:21:55 +08:00
  val s0_pointerChasingVAddr = io.fastpathIn.data(5, 0) +& io.loadFastImm(5, 0)
2022-09-22 08:56:44 +08:00
  load_s0.io.fastpath.valid := io.fastpathIn.valid
  load_s0.io.fastpath.data := Cat(io.fastpathIn.data(XLEN - 1, 6), s0_pointerChasingVAddr(5, 0))
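  // Example (hypothetical values): with forwarded data whose low 6 bits are
  // "b101010" and loadFastImm low bits "b000100", s0_pointerChasingVAddr is the
  // 7-bit sum "b0101110"; only its low 6 bits are spliced onto the unchanged
  // upper bits of the forwarded data. A carry into bit 6 (or any nonzero
  // loadFastImm(11, 6)) means the guess left the cache set and the chase is
  // cancelled in s1 (addressMisMatch below).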
2020-10-17 21:05:46 +08:00
2022-08-29 09:38:58 +08:00
  val s1_data = PipelineConnect(load_s0.io.out, load_s1.io.in, true.B,
    load_s0.io.out.bits.uop.robIdx.needFlush(io.redirect) && !s0_tryPointerChasing).get
2020-10-17 21:05:46 +08:00
2022-11-18 14:52:30 +08:00
// load s1
2022-12-02 22:35:02 +08:00
  // update s1_kill when any source has a valid request
  load_s1.io.s1_kill := RegEnable(load_s0.io.s0_kill, false.B, io.ldin.valid || io.lsqOut.valid || io.fastpathIn.valid)
2022-11-05 10:32:20 +08:00
  io.tlb.req_kill := load_s1.io.s1_kill
2021-09-02 22:53:18 +08:00
  load_s1.io.dtlbResp <> io.tlb.resp
2022-09-30 14:13:58 +08:00
io.dcache.s1_paddr_dup_lsu <> load_s1.io.lsuPAddr
io.dcache.s1_paddr_dup_dcache <> load_s1.io.dcachePAddr
io.dcache.s1_kill := load_s1.io.dcacheKill
load_s1.io.sbuffer <> io.sbuffer
load_s1.io.lsq <> io.lsq.forward
load_s1.io.loadViolationQueryReq <> io.lsq.loadViolationQuery.req
load_s1.io.dcacheBankConflict <> io.dcache.s1_bank_conflict
load_s1.io.csrCtrl <> io.csrCtrl
load_s1.io.reExecuteQuery := io.reExecuteQuery

// provide paddr and vaddr for lq
io.lsq.loadPaddrIn.valid := load_s1.io.out.valid
io.lsq.loadPaddrIn.bits.lqIdx := load_s1.io.out.bits.uop.lqIdx
io.lsq.loadPaddrIn.bits.paddr := load_s1.io.lsuPAddr
io.lsq.loadVaddrIn.valid := load_s1.io.in.valid && !load_s1.io.s1_kill
io.lsq.loadVaddrIn.bits.lqIdx := load_s1.io.out.bits.uop.lqIdx
io.lsq.loadVaddrIn.bits.vaddr := load_s1.io.out.bits.vaddr

// when S0 has the opportunity to try pointer chasing, make sure it truly goes to S1,
// i.e. S0's output is ready and the dcache is ready
val s0_doTryPointerChasing = s0_tryPointerChasing && load_s0.io.out.ready && load_s0.io.dcacheReq.ready
val s1_tryPointerChasing = RegNext(s0_doTryPointerChasing, false.B)
val s1_pointerChasingVAddr = RegEnable(s0_pointerChasingVAddr, s0_doTryPointerChasing)
val cancelPointerChasing = WireInit(false.B)
if (EnableLoadToLoadForward) {
  // Sometimes, we need to cancel the load-load forwarding.
  // These can be put at S0 if timing is bad at S1.
  // Case 0: CACHE_SET(base + offset) != CACHE_SET(base) (lowest 6-bit addition has an overflow)
  val addressMisMatch = s1_pointerChasingVAddr(6) || RegEnable(io.loadFastImm(11, 6).orR, s0_doTryPointerChasing)
  // Case 1: the address is not 64-bit aligned or the fuOpType is not LD
  val addressNotAligned = s1_pointerChasingVAddr(2, 0).orR
  val fuOpTypeIsNotLd = io.ldin.bits.uop.ctrl.fuOpType =/= LSUOpType.ld
  // Case 2: this is not a valid load-load pair
  val notFastMatch = RegEnable(!io.loadFastMatch, s0_tryPointerChasing)
  // Case 3: this load-load uop is cancelled
  val isCancelled = !io.ldin.valid
  when (s1_tryPointerChasing) {
    cancelPointerChasing := addressMisMatch || addressNotAligned || fuOpTypeIsNotLd || notFastMatch || isCancelled
    load_s1.io.in.bits.uop := io.ldin.bits.uop
    val spec_vaddr = s1_data.vaddr
    val vaddr = Cat(spec_vaddr(VAddrBits - 1, 6), s1_pointerChasingVAddr(5, 3), 0.U(3.W))
    load_s1.io.in.bits.vaddr := vaddr
    load_s1.io.in.bits.rsIdx := io.rsIdx
    load_s1.io.in.bits.isFirstIssue := io.isFirstIssue
    // We need to replace vaddr(5, 3).
    val spec_paddr = io.tlb.resp.bits.paddr(0)
    load_s1.io.dtlbResp.bits.paddr.foreach(_ := Cat(spec_paddr(PAddrBits - 1, 6), s1_pointerChasingVAddr(5, 3), 0.U(3.W)))
  }
  when (cancelPointerChasing) {
    load_s1.io.s1_kill := true.B
  }.otherwise {
    load_s0.io.s0_kill := s1_tryPointerChasing
    when (s1_tryPointerChasing) {
      io.ldin.ready := true.B
    }
  }

  XSPerfAccumulate("load_to_load_forward", s1_tryPointerChasing && !cancelPointerChasing)
  XSPerfAccumulate("load_to_load_forward_try", s1_tryPointerChasing)
  XSPerfAccumulate("load_to_load_forward_fail", cancelPointerChasing)
  XSPerfAccumulate("load_to_load_forward_fail_cancelled", cancelPointerChasing && isCancelled)
  XSPerfAccumulate("load_to_load_forward_fail_wakeup_mismatch", cancelPointerChasing && !isCancelled && notFastMatch)
  XSPerfAccumulate("load_to_load_forward_fail_op_not_ld",
    cancelPointerChasing && !isCancelled && !notFastMatch && fuOpTypeIsNotLd)
  XSPerfAccumulate("load_to_load_forward_fail_addr_align",
    cancelPointerChasing && !isCancelled && !notFastMatch && !fuOpTypeIsNotLd && addressNotAligned)
  XSPerfAccumulate("load_to_load_forward_fail_set_mismatch",
    cancelPointerChasing && !isCancelled && !notFastMatch && !fuOpTypeIsNotLd && !addressNotAligned && addressMisMatch)
}
PipelineConnect(load_s1.io.out, load_s2.io.in, true.B,
  load_s1.io.out.bits.uop.robIdx.needFlush(io.redirect) || cancelPointerChasing)
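// s1 -> s2: the flush term covers both a ROB redirect and a cancelled pointer-chasing attempt,
// so a cancelled load-load forward never reaches load_s2.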

// load s2
io.dcache.s2_kill := load_s2.io.dcache_kill // to kill mmio resp which are redirected
load_s2.io.dcacheResp <> io.dcache.resp
load_s2.io.pmpResp <> io.pmp
load_s2.io.static_pm := RegNext(io.tlb.resp.bits.static_pm)
load_s2.io.lsq.forwardData <> io.lsq.forward.forwardData
load_s2.io.lsq.forwardMask <> io.lsq.forward.forwardMask
load_s2.io.lsq.forwardMaskFast <> io.lsq.forward.forwardMaskFast // should not be used in load_s2
load_s2.io.lsq.dataInvalid <> io.lsq.forward.dataInvalid
load_s2.io.lsq.matchInvalid <> io.lsq.forward.matchInvalid
load_s2.io.sbuffer.forwardData <> io.sbuffer.forwardData
load_s2.io.sbuffer.forwardMask <> io.sbuffer.forwardMask
load_s2.io.sbuffer.forwardMaskFast <> io.sbuffer.forwardMaskFast // should not be used in load_s2
load_s2.io.sbuffer.dataInvalid <> io.sbuffer.dataInvalid // always false
load_s2.io.sbuffer.matchInvalid <> io.sbuffer.matchInvalid
load_s2.io.dataForwarded <> io.lsq.s2_load_data_forwarded
load_s2.io.dataInvalidSqIdx := io.lsq.forward.dataInvalidSqIdx // provide dataInvalidSqIdx to make wakeup faster
load_s2.io.loadViolationQueryResp <> io.lsq.loadViolationQuery.resp
load_s2.io.csrCtrl <> io.csrCtrl
load_s2.io.sentFastUop := io.fastUop.valid
load_s2.io.reExecuteQuery := io.reExecuteQuery

// feedback bank conflict / ld-vio check struct hazard to rs
io.feedbackFast.bits := RegNext(load_s1.io.rsFeedback.bits)
io.feedbackFast.valid := RegNext(load_s1.io.rsFeedback.valid && !load_s1.io.out.bits.uop.robIdx.needFlush(io.redirect))
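// feedbackFast reports the s1 replay causes to RS one cycle later; it is suppressed when the uop
// has already been flushed by a redirect.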

// pre-calculate sqIdx mask in s0, then send it to lsq in s1 for forwarding
val sqIdxMaskReg = RegNext(UIntToMask(load_s0.io.s0_sqIdx.value, StoreQueueSize))
// to enable load-load forwarding, sqIdxMask must be calculated based on ldin.uop
// If the timing here is not OK, load-load forwarding has to be disabled.
// Or should we calculate sqIdxMask in RS instead?
io.lsq.forward.sqIdxMask := sqIdxMaskReg
if (EnableLoadToLoadForward) {
  when (s1_tryPointerChasing) {
    io.lsq.forward.sqIdxMask := UIntToMask(io.ldin.bits.uop.sqIdx.value, StoreQueueSize)
  }
}
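
// sqIdxMask marks the store queue entries this load may depend on; the forward query in s1 uses it
// to bound the store-to-load forwarding search. When pointer chasing is tried, s0 did not take its
// uop from io.ldin, so the mask is recomputed here from io.ldin.bits.uop.sqIdx (presumably the
// RS-issued uop that catches up with the pipeline in s1).
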
// // use s2_hit_way to select data received in s1
// load_s2.io.dcacheResp.bits.data := Mux1H(RegNext(io.dcache.s1_hit_way), RegNext(io.dcache.s1_data))
// assert(load_s2.io.dcacheResp.bits.data === io.dcache.resp.bits.data)
// now io.fastUop.valid is sent to RS in load_s2
val s2_dcache_hit = io.dcache.s2_hit // dcache hit dup in lsu side
io.fastUop.valid := RegNext(
    !io.dcache.s1_disable_fast_wakeup && // load fast wakeup should be disabled when dcache data read is not ready
    load_s1.io.in.valid && // valid load request
    !load_s1.io.s1_kill && // killed by load-load forwarding
    !load_s1.io.dtlbResp.bits.fast_miss && // not mmio or tlb miss, pf / af not included here
    !io.lsq.forward.dataInvalidFast // forward failed
  ) &&
  !RegNext(load_s1.io.needLdVioCheckRedo) && // load-load violation check: load paddr cam struct hazard
  !RegNext(load_s1.io.needReExecute) &&
  !RegNext(load_s1.io.out.bits.uop.robIdx.needFlush(io.redirect)) &&
  (load_s2.io.in.valid && !load_s2.io.needReExecute && s2_dcache_hit) // dcache hit in lsu side
2022-07-12 22:22:01 +08:00
io . fastUop . bits : = RegNext ( load_s1 . io . out . bits . uop )
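// io.fastUop is the speculative wakeup sent to RS while the load is still in flight: it only fires
// when s1 saw a clean request (no kill, no tlb fast miss, no failed forward, no violation-check
// structural hazard or re-execution, no redirect) and s2 confirms a dcache hit on the lsu-side
// duplicated hit signal (s2_dcache_hit).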

XSDebug(load_s0.io.out.valid,
  p"S0: pc ${Hexadecimal(load_s0.io.out.bits.uop.cf.pc)}, lId ${Hexadecimal(load_s0.io.out.bits.uop.lqIdx.asUInt)}, " +
  p"vaddr ${Hexadecimal(load_s0.io.out.bits.vaddr)}, mask ${Hexadecimal(load_s0.io.out.bits.mask)}\n")
XSDebug(load_s1.io.out.valid,
p "S1: pc ${Hexadecimal(load_s1.io.out.bits.uop.cf.pc)}, lId ${Hexadecimal(load_s1.io.out.bits.uop.lqIdx.asUInt)}, tlb_miss ${io.tlb.resp.bits.miss}, " +
2020-10-26 17:31:38 +08:00
p "paddr ${Hexadecimal(load_s1.io.out.bits.paddr)}, mmio ${load_s1.io.out.bits.mmio}\n" )
2020-08-06 16:58:13 +08:00
2020-11-18 20:47:14 +08:00
// writeback to LSQ
// Current dcache uses MSHR
// Load queue will be updated at s2 for both hit/miss int/fp load
io.lsq.loadIn.valid := load_s2.io.out.valid
// generate LqWriteBundle from LsPipelineBundle
io.lsq.loadIn.bits.fromLsPipelineBundle(load_s2.io.out.bits)

io.lsq.replayFast := load_s1.io.replayFast
io.lsq.replaySlow := load_s2.io.replaySlow
io.lsq.replaySlow.valid := load_s2.io.replaySlow.valid && !load_s2.io.out.bits.uop.robIdx.needFlush(io.redirect)
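// replayFast carries the replay conditions detected in s1 and replaySlow those detected in s2
// back to the load queue; the slow path is additionally dropped when the uop is flushed by a redirect.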

// generate duplicated load queue data wen
val load_s2_valid_vec = RegInit(0.U(6.W))
val load_s2_leftFire = load_s1.io.out.valid && load_s2.io.in.ready
// val write_lq_safe = load_s2.io.write_lq_safe
load_s2_valid_vec := 0x0.U(6.W)
when (load_s2_leftFire) { load_s2_valid_vec := 0x3f.U(6.W) }
when (load_s1.io.out.bits.uop.robIdx.needFlush(io.redirect)) { load_s2_valid_vec := 0x0.U(6.W) }
assert(RegNext(load_s2.io.in.valid === load_s2_valid_vec(0)))
io.lsq.loadIn.bits.lq_data_wen_dup := load_s2_valid_vec.asBools()
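// load_s2_valid_vec keeps six copies of load_s2's input valid (presumably to cut the fan-out of the
// load queue data write enables); the assert checks that bit 0 tracks load_s2.io.in.valid exactly.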

// s2_dcache_require_replay signal will be RegNexted, then used in s3
io.lsq.s2_dcache_require_replay := load_s2.io.s2_dcache_require_replay

// write to rob and writeback bus
val s2_wb_valid = load_s2.io.out.valid && !load_s2.io.out.bits.miss && !load_s2.io.out.bits.mmio

// Int load, if hit, will be written back at s2
val hitLoadOut = Wire(Valid(new ExuOutput))
hitLoadOut.valid := s2_wb_valid
hitLoadOut.bits.uop := load_s2.io.out.bits.uop
hitLoadOut.bits.data := load_s2.io.out.bits.data
hitLoadOut.bits.redirectValid := false.B
hitLoadOut.bits.redirect := DontCare
hitLoadOut.bits.debug.isMMIO := load_s2.io.out.bits.mmio
hitLoadOut.bits.debug.isPerfCnt := false.B
hitLoadOut.bits.debug.paddr := load_s2.io.out.bits.paddr
hitLoadOut.bits.debug.vaddr := load_s2.io.out.bits.vaddr
hitLoadOut.bits.fflags := DontCare

load_s2.io.out.ready := true.B

// load s3
val s3_load_wb_meta_reg = RegNext(Mux(hitLoadOut.valid, hitLoadOut.bits, io.lsq.ldout.bits))
// data from load queue refill
val s3_loadDataFromLQ = RegEnable(io.lsq.ldRawData, io.lsq.ldout.valid)
val s3_rdataLQ = s3_loadDataFromLQ.mergedData()
val s3_rdataSelLQ = LookupTree(s3_loadDataFromLQ.addrOffset, List(
  "b000".U -> s3_rdataLQ(63, 0),
  "b001".U -> s3_rdataLQ(63, 8),
  "b010".U -> s3_rdataLQ(63, 16),
  "b011".U -> s3_rdataLQ(63, 24),
  "b100".U -> s3_rdataLQ(63, 32),
  "b101".U -> s3_rdataLQ(63, 40),
  "b110".U -> s3_rdataLQ(63, 48),
  "b111".U -> s3_rdataLQ(63, 56)
))
val s3_rdataPartialLoadLQ = rdataHelper(s3_loadDataFromLQ.uop, s3_rdataSelLQ)
// data from dcache hit
val s3_loadDataFromDcache = RegEnable(load_s2.io.loadDataFromDcache, load_s2.io.in.valid)
val s3_rdataDcache = s3_loadDataFromDcache.mergedData()
val s3_rdataSelDcache = LookupTree(s3_loadDataFromDcache.addrOffset, List(
  "b000".U -> s3_rdataDcache(63, 0),
  "b001".U -> s3_rdataDcache(63, 8),
  "b010".U -> s3_rdataDcache(63, 16),
  "b011".U -> s3_rdataDcache(63, 24),
  "b100".U -> s3_rdataDcache(63, 32),
  "b101".U -> s3_rdataDcache(63, 40),
  "b110".U -> s3_rdataDcache(63, 48),
  "b111".U -> s3_rdataDcache(63, 56)
))
val s3_rdataPartialLoadDcache = rdataHelper(s3_loadDataFromDcache.uop, s3_rdataSelDcache)
io.ldout.bits := s3_load_wb_meta_reg
io.ldout.bits.data := Mux(RegNext(hitLoadOut.valid), s3_rdataPartialLoadDcache, s3_rdataPartialLoadLQ)
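// addrOffset selects the byte offset inside the 64-bit word: e.g. offset "b011" picks
// mergedData()(63, 24), i.e. the data shifted right by 3 bytes, and rdataHelper then
// extracts/extends the bytes required by the load's fuOpType.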

io.ldout.valid := RegNext(hitLoadOut.valid) && !RegNext(load_s2.io.out.bits.uop.robIdx.needFlush(io.redirect)) ||
  RegNext(io.lsq.ldout.valid) && !RegNext(io.lsq.ldout.bits.uop.robIdx.needFlush(io.redirect)) && !RegNext(hitLoadOut.valid)

io.ldout.bits.uop.cf.exceptionVec(loadAccessFault) := s3_load_wb_meta_reg.uop.cf.exceptionVec(loadAccessFault) ||
  RegNext(hitLoadOut.valid) && load_s2.io.s3_delayed_load_error
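// Writeback priority in s3: a load that hit in the pipeline (hitLoadOut, registered from s2) wins
// over a load-queue writeback in the same cycle; either source is dropped if its uop was flushed
// by a redirect.
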
// fast load to load forward
io.fastpathOut.valid := RegNext(load_s2.io.out.valid) // for debug only
io.fastpathOut.data := s3_loadDataFromDcache.mergedData() // fastpath is for ld only
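// The raw merged dcache data (before byte select / sign extension) is forwarded to the next load's
// s0 as the speculative base address for pointer chasing; fastpathOut.valid is kept for debug only
// and the receiving load unit qualifies the forwarding itself in s1.
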
// feedback tlb miss / dcache miss queue full
io.feedbackSlow.bits := RegNext(load_s2.io.rsFeedback.bits)
io.feedbackSlow.valid := RegNext(load_s2.io.rsFeedback.valid && !load_s2.io.out.bits.uop.robIdx.needFlush(io.redirect))
// If replay is reported at load_s1, inst will be canceled (will not enter load_s2),
// in that case:
// * replay should not be reported twice
assert(!(RegNext(io.feedbackFast.valid) && io.feedbackSlow.valid))
// * io.fastUop.valid should not be reported
assert(!RegNext(io.feedbackFast.valid && !io.feedbackFast.bits.hit && io.fastUop.valid))

// load forward_fail/ldld_violation check
// check for inst in load pipeline
val s3_forward_fail = RegNext(io.lsq.forward.matchInvalid || io.sbuffer.matchInvalid)
val s3_ldld_violation = RegNext(
  io.lsq.loadViolationQuery.resp.valid &&
  io.lsq.loadViolationQuery.resp.bits.have_violation &&
  RegNext(io.csrCtrl.ldld_vio_check_enable)
)
val s3_need_replay_from_fetch = s3_forward_fail || s3_ldld_violation
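// Both conditions mean the earlier speculation cannot be fixed by an RS replay: a forward whose
// match turned out to be invalid (matchInvalid) or a detected load-load ordering violation requires
// the instruction (and younger ones) to be refetched, hence "replay from fetch".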
val s3_can_replay_from_fetch = RegEnable(load_s2.io.s2_can_replay_from_fetch, load_s2.io.out.valid)
// 1) use load pipe check result generated in load_s3 iff load_hit
when (RegNext(hitLoadOut.valid)) {
  io.ldout.bits.uop.ctrl.replayInst := s3_need_replay_from_fetch
}
// 2) otherwise, write check result to load queue
io.lsq.s3_replay_from_fetch := s3_need_replay_from_fetch && s3_can_replay_from_fetch

// s3_delayed_load_error path is not used for now, as we write back the load result in load_s3
// but we keep this path for future use
io.s3_delayed_load_error := false.B
io.lsq.s3_delayed_load_error := false.B //load_s2.io.s3_delayed_load_error

io.lsq.ldout.ready := !hitLoadOut.valid
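// The load queue writeback port is only accepted when no pipeline hit load writes back this cycle,
// matching the priority encoded in io.ldout.valid above.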

when (io.feedbackSlow.valid && !io.feedbackSlow.bits.hit) {
  // when a replay from rs is needed, the inst should not be written back to rob
  assert(RegNext(!hitLoadOut.valid))
  assert(RegNext(!io.lsq.loadIn.valid) || RegNext(load_s2.io.s2_dcache_require_replay))
}

val lastValidData = RegEnable(io.ldout.bits.data, io.ldout.fire)
val hitLoadAddrTriggerHitVec = Wire(Vec(3, Bool()))
val lqLoadAddrTriggerHitVec = io.lsq.trigger.lqLoadAddrTriggerHitVec
(0 until 3).map { i => {
  val tdata2 = io.trigger(i).tdata2
  val matchType = io.trigger(i).matchType
  val tEnable = io.trigger(i).tEnable

  hitLoadAddrTriggerHitVec(i) := TriggerCmp(load_s2.io.out.bits.vaddr, tdata2, matchType, tEnable)
  io.trigger(i).addrHit := Mux(hitLoadOut.valid, hitLoadAddrTriggerHitVec(i), lqLoadAddrTriggerHitVec(i))
  io.trigger(i).lastDataHit := TriggerCmp(lastValidData, tdata2, matchType, tEnable)
}}
io.lsq.trigger.hitLoadAddrTriggerHitVec := hitLoadAddrTriggerHitVec
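// Debug-mode triggers: the address compare uses the s2 vaddr when the load writes back from the
// pipeline and falls back to the load-queue result otherwise; lastDataHit compares against the most
// recently written-back load data (lastValidData).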

val perfEvents = Seq(
  ("load_s0_in_fire         ", load_s0.io.in.fire),
  ("load_to_load_forward    ", load_s1.io.out.valid && s1_tryPointerChasing && !cancelPointerChasing),
  ("stall_dcache            ", load_s0.io.out.valid && load_s0.io.out.ready && !load_s0.io.dcacheReq.ready),
  ("load_s1_in_fire         ", load_s1.io.in.fire),
  ("load_s1_tlb_miss        ", load_s1.io.in.fire && load_s1.io.dtlbResp.bits.miss),
  ("load_s2_in_fire         ", load_s2.io.in.fire),
  ("load_s2_dcache_miss     ", load_s2.io.in.fire && load_s2.io.dcacheResp.bits.miss),
  ("load_s2_replay          ", load_s2.io.rsFeedback.valid && !load_s2.io.rsFeedback.bits.hit),
  ("load_s2_replay_tlb_miss ", load_s2.io.rsFeedback.valid && !load_s2.io.rsFeedback.bits.hit && load_s2.io.in.bits.tlbMiss),
  ("load_s2_replay_cache    ", load_s2.io.rsFeedback.valid && !load_s2.io.rsFeedback.bits.hit && !load_s2.io.in.bits.tlbMiss && load_s2.io.dcacheResp.bits.miss),
)
generatePerfEvent()

when (io.ldout.fire) {
  XSDebug("ldout %x\n", io.ldout.bits.uop.cf.pc)
}
}