feat(outstanding): support nc outstanding and remove mmio st outstanding

Yanqin Li 2024-11-19 17:35:13 +08:00 committed by zhanglinjuan
parent cfdd605feb
commit e04c5f647e
12 changed files with 582 additions and 433 deletions

View File

@ -205,6 +205,11 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
memBlock.io.redirect := backend.io.mem.redirect
memBlock.io.ooo_to_mem.csrCtrl := backend.io.mem.csrCtrl
// XXX lyq: remove this before PR
val tmp_debug_uncache_otsd = Constantin.createRecord("uncache_outstanding_enable", 0)
memBlock.io.ooo_to_mem.csrCtrl.uncache_write_outstanding_enable := tmp_debug_uncache_otsd
memBlock.io.ooo_to_mem.tlbCsr := backend.io.mem.tlbCsr
memBlock.io.ooo_to_mem.lsqio.lcommit := backend.io.mem.robLsqIO.lcommit
memBlock.io.ooo_to_mem.lsqio.scommit := backend.io.mem.robLsqIO.scommit

View File

@ -820,6 +820,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
// forward
loadUnits(i).io.lsq.forward <> lsq.io.forward(i)
loadUnits(i).io.sbuffer <> sbuffer.io.forward(i)
loadUnits(i).io.ubuffer <> uncache.io.forward(i)
loadUnits(i).io.tl_d_channel := dcache.io.lsu.forward_D(i)
loadUnits(i).io.forward_mshr <> dcache.io.lsu.forward_mshr(i)
// ld-ld violation check
@ -963,6 +964,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
hybridUnits(i).io.ldu_io.lsq.forward <> lsq.io.forward(LduCnt + i)
// forward
hybridUnits(i).io.ldu_io.sbuffer <> sbuffer.io.forward(LduCnt + i)
hybridUnits(i).io.ldu_io.ubuffer <> uncache.io.forward(LduCnt + i)
// hybridUnits(i).io.ldu_io.vec_forward <> vsFlowQueue.io.forward(LduCnt + i)
hybridUnits(i).io.ldu_io.vec_forward := DontCare
hybridUnits(i).io.ldu_io.tl_d_channel := dcache.io.lsu.forward_D(LduCnt + i)
@ -1332,8 +1334,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
is (s_idle) {
when (uncacheReq.fire) {
when (lsq.io.uncache.req.valid) {
val isStore = lsq.io.uncache.req.bits.cmd === MemoryOpConstants.M_XWR
when (!isStore || !io.ooo_to_mem.csrCtrl.uncache_write_outstanding_enable) {
when (!lsq.io.uncache.req.bits.nc || !io.ooo_to_mem.csrCtrl.uncache_write_outstanding_enable) {
uncacheState := s_scalar_uncache
}
}.otherwise {

View File

@ -515,6 +515,7 @@ class UncacheWordReq(implicit p: Parameters) extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val vaddr = UInt(VAddrBits.W) // for uncache buffer forwarding
val data = UInt(XLEN.W)
val mask = UInt((XLEN/8).W)
val id = UInt(uncacheIdxBits.W)
@ -534,8 +535,9 @@ class UncacheWordResp(implicit p: Parameters) extends DCacheBundle
{
val data = UInt(XLEN.W)
val data_delayed = UInt(XLEN.W)
val id = UInt(uncacheIdxBits.W)
val nc = Bool()
val id = UInt(uncacheIdxBits.W) // identifies which request this resp belongs to
val nc = Bool() // resp belongs to an NC (non-cacheable) access
val is2lq = Bool() // resp should be routed to the load queue
val miss = Bool()
val replay = Bool()
val tag_error = Bool()

View File

@ -22,6 +22,7 @@ import org.chipsalliance.cde.config.Parameters
import utils._
import utility._
import xiangshan._
import xiangshan.mem._
import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp, TransferSizes}
import freechips.rocketchip.tilelink.{TLArbiter, TLBundleA, TLBundleD, TLClientNode, TLEdgeOut, TLMasterParameters, TLMasterPortParameters}
@ -33,8 +34,9 @@ class UncacheFlushBundle extends Bundle {
class UncacheEntry(implicit p: Parameters) extends DCacheBundle {
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val vaddr = UInt(VAddrBits.W)
val data = UInt(XLEN.W)
val mask = UInt((XLEN/8).W)
val mask = UInt(DataBytes.W)
val id = UInt(uncacheIdxBits.W)
val nc = Bool()
val atomic = Bool()
@ -43,9 +45,14 @@ class UncacheEntry(implicit p: Parameters) extends DCacheBundle {
val resp_data = UInt(XLEN.W)
val resp_nderr = Bool()
// FIXME lyq: confirm the forward logic; if forwarding is not needed, these fields can be removed
val fwd_data = UInt(XLEN.W)
val fwd_mask = UInt(DataBytes.W)
def set(x: UncacheWordReq): Unit = {
cmd := x.cmd
addr := x.addr
vaddr := x.vaddr
data := x.data
mask := x.mask
id := x.id
@ -53,6 +60,8 @@ class UncacheEntry(implicit p: Parameters) extends DCacheBundle {
atomic := x.atomic
resp_nderr := false.B
resp_data := 0.U
fwd_data := 0.U
fwd_mask := 0.U
}
def update(x: TLBundleD): Unit = {
@ -60,10 +69,18 @@ class UncacheEntry(implicit p: Parameters) extends DCacheBundle {
resp_nderr := x.denied
}
def update(forwardData: UInt, forwardMask: UInt): Unit = {
fwd_data := forwardData
fwd_mask := forwardMask
}
def toUncacheWordResp(): UncacheWordResp = {
val resp_fwd_data = VecInit((0 until DataBytes).map(j =>
Mux(fwd_mask(j), fwd_data(8*(j+1)-1, 8*j), resp_data(8*(j+1)-1, 8*j))
)).asUInt
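// For example (values hypothetical): with fwd_mask = "b0000_1111".U, bytes 3..0 of the
// response come from fwd_data and bytes 7..4 from resp_data, so data forwarded from a
// newer store in the buffer overrides the stale bus data byte by byte.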
val r = Wire(new UncacheWordResp)
r := DontCare
r.data := resp_data
r.data := resp_fwd_data
r.id := id
r.nderr := resp_nderr
r.nc := nc
@ -121,6 +138,7 @@ class UncacheIO(implicit p: Parameters) extends DCacheBundle {
val enableOutstanding = Input(Bool())
val flush = Flipped(new UncacheFlushBundle)
val lsq = Flipped(new UncacheWordIO)
val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
}
// convert DCacheIO to TileLink
@ -182,10 +200,19 @@ class UncacheImp(outer: Uncache)extends LazyModuleImp(outer)
val uState = RegInit(s_idle)
def sizeMap[T <: Data](f: Int => T) = VecInit((0 until UncacheBufferSize).map(f))
def isStore(e: UncacheEntry): Bool = e.cmd === MemoryOpConstants.M_XWR
def isStore(x: UInt): Bool = x === MemoryOpConstants.M_XWR
// drain buffer
val empty = Wire(Bool())
val f0_needDrain = Wire(Bool())
val do_uarch_drain = RegNext(f0_needDrain)
val q0_entry = Wire(new UncacheEntry)
val q0_canSentIdx = Wire(UInt(INDEX_WIDTH.W))
val q0_canSent = Wire(Bool())
/******************************************************************
* uState for non-outstanding
******************************************************************/
@ -234,28 +261,37 @@ class UncacheImp(outer: Uncache)extends LazyModuleImp(outer)
2. how to merge store and response precisely
*/
val e0_invalids = sizeMap(i => !states(i).isValid())
val e0_invalid_oh = VecInit(PriorityEncoderOH(e0_invalids)).asUInt
val e0_fire = req.fire
val e0_req = req.bits
req_ready := e0_invalid_oh.orR
/**
TODO lyq: prohibit, wait, or forward?
NOW: strictly block on a same-address match; otherwise each of the cases below would need exhaustive handling.
- ld->ld wait
- ld->st forward
- st->ld forward
- st->st block
*/
val e0_existSameVec = sizeMap(j =>
e0_req.addr === entries(j).addr && states(j).isValid()
)
val e0_invalidVec = sizeMap(i => !states(i).isValid() && !e0_existSameVec(i))
val (e0_allocIdx, e0_canAlloc) = PriorityEncoderWithFlag(e0_invalidVec)
val e0_alloc = e0_canAlloc && e0_fire
req_ready := e0_invalidVec.asUInt.orR && !do_uarch_drain
for (i <- 0 until UncacheBufferSize) {
val alloc = e0_fire && e0_invalid_oh(i)
when(alloc){
entries(i).set(e0_req)
states(i).setValid(true.B)
// decide whether to wait on a same-address block: check e0 against in-flight entries and q0
val waitSameVec = sizeMap(j =>
e0_req.addr === entries(j).addr && states(j).isValid() && states(j).isInflight()
)
val waitQ0 = e0_req.addr === q0_entry.addr && q0_canSent
when (waitSameVec.reduce(_ || _) || waitQ0) {
states(i).setWaitSame(true.B)
}
when (e0_alloc) {
entries(e0_allocIdx).set(e0_req)
states(e0_allocIdx).setValid(true.B)
// decide whether to wait on a same-address block: check e0 against in-flight entries and q0
val waitSameVec = sizeMap(j =>
e0_req.addr === entries(j).addr && states(j).isValid() && states(j).isInflight()
)
val waitQ0 = e0_req.addr === q0_entry.addr && q0_canSent
when (waitSameVec.reduce(_ || _) || waitQ0) {
states(e0_allocIdx).setWaitSame(true.B)
}
}
@ -272,7 +308,7 @@ class UncacheImp(outer: Uncache)extends LazyModuleImp(outer)
******************************************************************/
val q0_canSentVec = sizeMap(i =>
// (io.enableOutstanding || uState === s_refill_req) && // FIXME lyq: comment for debug
(io.enableOutstanding || uState === s_refill_req) &&
states(i).can2Uncache()
)
val q0_res = PriorityEncoderWithFlag(q0_canSentVec)
@ -360,9 +396,75 @@ class UncacheImp(outer: Uncache)extends LazyModuleImp(outer)
* 1. when io.flush.valid is true
* 2. when io.lsq.req.bits.atomic is true
******************************************************************/
empty := !VecInit(states.map(_.isValid())).asUInt.orR
io.flush.empty := empty
val invalid_entries = PopCount(states.map(!_.isValid()))
io.flush.empty := invalid_entries === UncacheBufferSize.U
/******************************************************************
* Load Data Forward
*
* 0. ld in ldu pipeline
*    f0: tag match, fast resp
*    f1: data resp
*
* 1. ld in buffer (in "Enter Buffer")
*    ld(en) -> st(in): ld entry.update, state.updateUncacheResp
*    st(en) -> ld(in): ld entry.update, state.updateUncacheResp
*    NOW: same-address requests are strictly blocked, so this in-buffer forwarding does not occur.
*
******************************************************************/
val f0_validMask = sizeMap(i => isStore(entries(i)) && states(i).isValid())
val f0_tagMismatchVec = Wire(Vec(LoadPipelineWidth, Bool()))
f0_needDrain := f0_tagMismatchVec.asUInt.orR && !empty
for ((forward, i) <- io.forward.zipWithIndex) {
val f0_vtagMatches = sizeMap(w => entries(w).vaddr === forward.vaddr)
val f0_ptagMatches = sizeMap(w => entries(w).addr === forward.paddr)
f0_tagMismatchVec(i) := forward.valid && sizeMap(w =>
f0_vtagMatches(w) =/= f0_ptagMatches(w) && f0_validMask(w)
).asUInt.orR
when (f0_tagMismatchVec(i)) {
XSDebug("forward tag mismatch: pmatch %x vmatch %x vaddr %x paddr %x\n",
RegNext(f0_ptagMatches.asUInt),
RegNext(f0_vtagMatches.asUInt),
RegNext(forward.vaddr),
RegNext(forward.paddr)
)
}
val f0_validTagMatches = sizeMap(w => f0_ptagMatches(w) && f0_validMask(w) && forward.valid)
val f0_fwdMaskCandidates = VecInit(entries.map(e => e.mask))
val f0_fwdDataCandidates = VecInit(entries.map(e => e.data))
val f0_fwdMask = shiftMaskToHigh(
forward.paddr,
Mux1H(f0_validTagMatches, f0_fwdMaskCandidates)
).asTypeOf(Vec(VDataBytes, Bool()))
val f0_fwdData = shiftDataToHigh(
forward.paddr,
Mux1H(f0_validTagMatches, f0_fwdDataCandidates)
).asTypeOf(Vec(VDataBytes, UInt(8.W)))
val f1_fwdValid = RegNext(forward.valid)
val f1_fwdMask = RegEnable(f0_fwdMask, forward.valid)
val f1_fwdData = RegEnable(f0_fwdData, forward.valid)
forward.addrInvalid := false.B // addr in ubuffer is always ready
forward.dataInvalid := false.B // data in ubuffer is always ready
forward.matchInvalid := f0_tagMismatchVec(i) // paddr / vaddr cam result does not match
for (j <- 0 until VDataBytes) {
forward.forwardMaskFast(j) := f0_fwdMask(j)
forward.forwardMask(j) := false.B
forward.forwardData(j) := DontCare
when(f1_fwdMask(j) && f1_fwdValid) {
forward.forwardMask(j) := true.B
forward.forwardData(j) := f1_fwdData(j)
}
}
}
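// Timing sketch of the forward query above (matching the io.ubuffer hookup in LoadUnit
// later in this diff): the load pipeline drives forward.vaddr/paddr in its s1 stage (f0
// here) and can use forwardMaskFast combinationally in that same cycle; forwardMask and
// forwardData are registered and become valid one cycle later (f1), lining up with the
// per-byte data mux in the load pipeline's s2 stage.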
/******************************************************************
@ -386,18 +488,18 @@ class UncacheImp(outer: Uncache)extends LazyModuleImp(outer)
}
/* Performance Counters */
def isStore: Bool = io.lsq.req.bits.cmd === MemoryOpConstants.M_XWR
XSPerfAccumulate("uncache_mmio_store", io.lsq.req.fire && isStore && !io.lsq.req.bits.nc)
XSPerfAccumulate("uncache_mmio_load", io.lsq.req.fire && !isStore && !io.lsq.req.bits.nc)
XSPerfAccumulate("uncache_nc_store", io.lsq.req.fire && isStore && io.lsq.req.bits.nc)
XSPerfAccumulate("uncache_nc_load", io.lsq.req.fire && !isStore && io.lsq.req.bits.nc)
XSPerfAccumulate("uncache_mmio_store", io.lsq.req.fire && isStore(io.lsq.req.bits.cmd) && !io.lsq.req.bits.nc)
XSPerfAccumulate("uncache_mmio_load", io.lsq.req.fire && !isStore(io.lsq.req.bits.cmd) && !io.lsq.req.bits.nc)
XSPerfAccumulate("uncache_nc_store", io.lsq.req.fire && isStore(io.lsq.req.bits.cmd) && io.lsq.req.bits.nc)
XSPerfAccumulate("uncache_nc_load", io.lsq.req.fire && !isStore(io.lsq.req.bits.cmd) && io.lsq.req.bits.nc)
XSPerfAccumulate("uncache_outstanding", uState =/= s_refill_req && mem_acquire.fire)
XSPerfAccumulate("vaddr_match_failed", PopCount(f0_tagMismatchVec))
val perfEvents = Seq(
("uncache_mmio_store", io.lsq.req.fire && isStore && !io.lsq.req.bits.nc),
("uncache_mmio_load", io.lsq.req.fire && !isStore && !io.lsq.req.bits.nc),
("uncache_nc_store", io.lsq.req.fire && isStore && io.lsq.req.bits.nc),
("uncache_nc_load", io.lsq.req.fire && !isStore && io.lsq.req.bits.nc),
("uncache_mmio_store", io.lsq.req.fire && isStore(io.lsq.req.bits.cmd) && !io.lsq.req.bits.nc),
("uncache_mmio_load", io.lsq.req.fire && !isStore(io.lsq.req.bits.cmd) && !io.lsq.req.bits.nc),
("uncache_nc_store", io.lsq.req.fire && isStore(io.lsq.req.bits.cmd) && io.lsq.req.bits.nc),
("uncache_nc_load", io.lsq.req.fire && !isStore(io.lsq.req.bits.cmd) && io.lsq.req.bits.nc),
("uncache_outstanding", uState =/= s_refill_req && mem_acquire.fire)
)

View File

@ -75,6 +75,16 @@ object shiftMaskToLow {
Mux(addr(3),(mask >> 8).asUInt,mask)
}
}
object shiftDataToHigh {
def apply(addr: UInt,data : UInt): UInt = {
Mux(addr(3), (data << 64).asUInt, data)
}
}
object shiftMaskToHigh {
def apply(addr: UInt,mask: UInt): UInt = {
Mux(addr(3), (mask << 8).asUInt, mask)
}
}
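// Minimal usage sketch (names hypothetical, assuming VLEN = 128): an 8-byte uncache entry
// that covers the upper half of a 16-byte block (addr(3) set) is shifted up so that its
// bytes land in the high half of the 128-bit forwarding lane:
//   val laneMask  = shiftMaskToHigh(entry.addr, entry.mask) // 16-bit byte-valid mask
//   val laneData  = shiftDataToHigh(entry.addr, entry.data) // data aligned to the lane
//   val laneBytes = laneData.asTypeOf(Vec(VDataBytes, UInt(8.W)))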
class LsPipelineBundle(implicit p: Parameters) extends XSBundle
with HasDCacheParameters

View File

@ -247,8 +247,10 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
switch(pendingstate){
is(s_idle){
when(io.uncache.req.fire){
pendingstate := Mux(loadQueue.io.uncache.req.valid, s_load,
Mux(io.uncacheOutstanding, s_idle, s_store))
pendingstate :=
Mux(io.uncacheOutstanding && io.uncache.req.bits.nc, s_idle,
Mux(loadQueue.io.uncache.req.valid, s_load,
s_store))
}
}
is(s_load){
@ -279,14 +281,10 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
io.uncache.req.valid := false.B
io.uncache.req.bits := DontCare
}
when (io.uncacheOutstanding) {
when (io.uncache.resp.bits.is2lq) {
io.uncache.resp <> loadQueue.io.uncache.resp
} .otherwise {
when(pendingstate === s_load){
io.uncache.resp <> loadQueue.io.uncache.resp
}.otherwise{
io.uncache.resp <> storeQueue.io.uncache.resp
}
io.uncache.resp <> storeQueue.io.uncache.resp
}
loadQueue.io.debugTopDown <> io.debugTopDown

View File

@ -304,6 +304,13 @@ class StoreQueue(implicit p: Parameters) extends XSModule
val mmioReq = Wire(chiselTypeOf(io.uncache.req))
val ncReq = Wire(chiselTypeOf(io.uncache.req))
val ncResp = Wire(chiselTypeOf(io.uncache.resp))
val ncDoReq = Wire(Bool())
val ncDoResp = Wire(Bool())
val ncReadNextTrigger = Mux(io.uncacheOutstanding, ncDoReq, ncDoResp)
// ncDoReq is delayed by two cycles (double RegNext) because the ubuffer data write takes 3 cycles.
// TODO lyq: eliminate this coupling by passing the signal through the ubuffer instead
val ncDeqTrigger = Mux(io.uncacheOutstanding, RegNext(RegNext(ncDoReq)), ncDoResp)
val ncPtr = Mux(io.uncacheOutstanding, RegNext(RegNext(io.uncache.req.bits.id)), io.uncache.resp.bits.id)
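// Cycle sketch for an outstanding nc store (io.uncacheOutstanding = 1), as assumed above:
//   cycle 0: io.uncache.req fires (ncDoReq) -> the read pointer may advance (ncReadNextTrigger)
//   cycle 2: the ubuffer is assumed to have absorbed the data, so the SQ entry is
//            deallocated (ncDeqTrigger / ncPtr, both double-RegNexted from cycle 0)
// Without outstanding support, both events instead wait for the uncache response (ncDoResp).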
// store miss align info
io.maControl.storeInfo.data := dataModule.io.rdata(0).data
@ -320,7 +327,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
val rdataPtrExtNext = Wire(Vec(EnsbufferWidth, new SqPtr))
rdataPtrExtNext := rdataPtrExt.map(i => i +
PopCount(dataBuffer.io.enq.map(_.fire)) +
PopCount(ncResp.fire || io.mmioStout.fire || io.vecmmioStout.fire)
PopCount(ncReadNextTrigger || io.mmioStout.fire || io.vecmmioStout.fire)
)
// deqPtrExtNext traces which inst is about to leave store queue
@ -334,12 +341,12 @@ class StoreQueue(implicit p: Parameters) extends XSModule
val deqPtrExtNext = Wire(Vec(EnsbufferWidth, new SqPtr))
deqPtrExtNext := deqPtrExt.map(i => i +
RegNext(PopCount(VecInit(io.sbuffer.map(_.fire)))) +
PopCount(ncResp.fire || io.mmioStout.fire || io.vecmmioStout.fire)
PopCount(ncDeqTrigger || io.mmioStout.fire || io.vecmmioStout.fire)
)
io.sqDeq := RegNext(
RegNext(PopCount(VecInit(io.sbuffer.map(_.fire && !misalignBlock)))) +
PopCount(ncResp.fire || io.mmioStout.fire || io.vecmmioStout.fire || finishMisalignSt)
PopCount(ncDeqTrigger || io.mmioStout.fire || io.vecmmioStout.fire || finishMisalignSt)
)
assert(!RegNext(RegNext(io.sbuffer(0).fire) && (io.mmioStout.fire || io.vecmmioStout.fire)))
@ -804,11 +811,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
}
is(s_req) {
when (mmioDoReq) {
when (io.uncacheOutstanding) {
mmioState := s_wb
} .otherwise {
mmioState := s_resp
}
mmioState := s_resp
}
}
is(s_resp) {
@ -841,6 +844,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
mmioReq.bits := DontCare
mmioReq.bits.cmd := MemoryOpConstants.M_XWR
mmioReq.bits.addr := paddrModule.io.rdata(0) // data(deqPtr) -> rdata(0)
mmioReq.bits.vaddr:= vaddrModule.io.rdata(0)
mmioReq.bits.data := shiftDataToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).data)
mmioReq.bits.mask := shiftMaskToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).mask)
mmioReq.bits.atomic := atomic(GatedRegNext(rdataPtrExtNext(0)).value)
@ -855,7 +859,6 @@ class StoreQueue(implicit p: Parameters) extends XSModule
// TODO: CAN NOT deal with vector nc now!
val nc_idle :: nc_req :: nc_resp :: Nil = Enum(3)
val ncState = RegInit(nc_idle)
val ncDoReq = io.uncache.req.fire && io.uncache.req.bits.nc
val rptr0 = rdataPtrExt(0).value
switch(ncState){
is(nc_idle) {
@ -865,7 +868,11 @@ class StoreQueue(implicit p: Parameters) extends XSModule
}
is(nc_req) {
when(ncDoReq) {
ncState := nc_resp
when(io.uncacheOutstanding) {
ncState := nc_idle
}.otherwise{
ncState := nc_resp
}
}
}
is(nc_resp) {
@ -874,23 +881,27 @@ class StoreQueue(implicit p: Parameters) extends XSModule
}
}
}
ncDoReq := io.uncache.req.fire && io.uncache.req.bits.nc
ncDoResp := ncResp.fire
ncReq.valid := ncState === nc_req
ncReq.bits := DontCare
ncReq.bits.cmd := MemoryOpConstants.M_XWR
ncReq.bits.addr := paddrModule.io.rdata(0)
ncReq.bits.vaddr:= vaddrModule.io.rdata(0)
ncReq.bits.data := shiftDataToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).data)
ncReq.bits.mask := shiftMaskToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).mask)
ncReq.bits.atomic := atomic(GatedRegNext(rdataPtrExtNext(0)).value)
ncReq.bits.nc := true.B
ncReq.bits.id := rdataPtrExt(0).value
ncReq.bits.id := rptr0
ncResp.ready := io.uncache.resp.ready
ncResp.valid := io.uncache.resp.fire && io.uncache.resp.bits.nc
ncResp.bits <> io.uncache.resp.bits
when (ncResp.fire) {
val ptr = io.uncache.resp.bits.id
allocated(ptr) := false.B
XSDebug("nc fire: ptr %d\n", ptr)
when (ncDeqTrigger) {
allocated(ncPtr) := false.B
XSDebug("nc fire: ptr %d\n", ncPtr)
}
mmioReq.ready := io.uncache.req.ready

View File

@ -129,6 +129,7 @@ class IOBufferEntry(entryIndex: Int)(implicit p: Parameters) extends XSModule
io.uncache.req.bits.cmd := MemoryOpConstants.M_XRD
io.uncache.req.bits.data := DontCare
io.uncache.req.bits.addr := req.paddr
io.uncache.req.bits.vaddr:= req.vaddr
io.uncache.req.bits.mask := Mux(req.paddr(3), req.mask(15, 8), req.mask(7, 0))
io.uncache.req.bits.id := io.id
io.uncache.req.bits.instrtype := DontCare
@ -241,7 +242,7 @@ class IOBuffer(implicit p: Parameters) extends XSModule
allocWidth = LoadPipelineWidth,
freeWidth = 4,
enablePreAlloc = true,
moduleName = "UncacheBuffer freelist"
moduleName = "IOBuffer freelist"
))
freeList.io := DontCare

View File

@ -1,4 +1,4 @@
/***************************************************************************************
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
@ -13,420 +13,421 @@
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
package xiangshan.mem
import chisel3._
import chisel3.util._
import org.chipsalliance.cde.config._
import xiangshan._
import xiangshan.backend.rob.{RobPtr, RobLsqIO}
import xiangshan.ExceptionNO._
import xiangshan.cache._
import utils._
import utility._
import xiangshan.backend.Bundles
import xiangshan.backend.Bundles.{DynInst, MemExuOutput}
import xiangshan.backend.fu.FuConfig.LduCfg
import chisel3._
import chisel3.util._
import org.chipsalliance.cde.config._
import xiangshan._
import xiangshan.backend.rob.{RobPtr, RobLsqIO}
import xiangshan.ExceptionNO._
import xiangshan.cache._
import utils._
import utility._
import xiangshan.backend.Bundles
import xiangshan.backend.Bundles.{DynInst, MemExuOutput}
import xiangshan.backend.fu.FuConfig.LduCfg
class NCBufferEntry(entryIndex: Int)(implicit p: Parameters) extends XSModule
with HasCircularQueuePtrHelper
with HasLoadHelper
{
val io = IO(new Bundle() {
val id = Input(UInt())
class NCBufferEntry(entryIndex: Int)(implicit p: Parameters) extends XSModule
with HasCircularQueuePtrHelper
with HasLoadHelper
{
val io = IO(new Bundle() {
val id = Input(UInt())
val redirect = Flipped(Valid(new Redirect))
val redirect = Flipped(Valid(new Redirect))
// client requests
val req = Flipped(Valid(new LqWriteBundle))
// client requests
val req = Flipped(Valid(new LqWriteBundle))
// rerequest nc_with_data to loadunit
val ncOut = DecoupledIO(new LsPipelineBundle)
// rerequest nc_with_data to loadunit
val ncOut = DecoupledIO(new LsPipelineBundle)
// uncache io
val uncache = new UncacheWordIO
// uncache io
val uncache = new UncacheWordIO
// flush this entry
val flush = Output(Bool())
// flush this entry
val flush = Output(Bool())
// exception generated by outer bus
val exception = Valid(new LqWriteBundle)
})
// exception generated by outer bus
val exception = Valid(new LqWriteBundle)
})
val req_valid = RegInit(false.B)
val req = Reg(new LqWriteBundle)
val req_valid = RegInit(false.B)
val req = Reg(new LqWriteBundle)
val s_idle :: s_req :: s_resp :: s_wait :: Nil = Enum(4)
val uncacheState = RegInit(s_idle)
val uncacheData = Reg(io.uncache.resp.bits.data.cloneType)
val nderr = RegInit(false.B)
val s_idle :: s_req :: s_resp :: s_wait :: Nil = Enum(4)
val uncacheState = RegInit(s_idle)
val uncacheData = Reg(io.uncache.resp.bits.data.cloneType)
val nderr = RegInit(false.B)
// enqueue
when (req_valid && req.uop.robIdx.needFlush(io.redirect)) {
req_valid := false.B
} .elsewhen (io.req.valid) {
XSError(req_valid, p"UncacheNCBuffer: You can not write a valid entry: $entryIndex")
req_valid := true.B
req := io.req.bits
nderr := false.B
} .elsewhen (io.ncOut.fire) {
req_valid := false.B
}
// enqueue
when (req_valid && req.uop.robIdx.needFlush(io.redirect)) {
req_valid := false.B
} .elsewhen (io.req.valid) {
XSError(req_valid, p"UncacheNCBuffer: You can not write a valid entry: $entryIndex")
req_valid := true.B
req := io.req.bits
nderr := false.B
} .elsewhen (io.ncOut.fire) {
req_valid := false.B
}
io.flush := req_valid && req.uop.robIdx.needFlush(io.redirect)
/**
* NC operations
*
* States:
* (1) s_idle: wait for nc req from loadunit
* (2) s_req: hold the req until the uncache channel is ready to accept it
* (3) s_resp: wait for the response from the uncache channel
* (4) s_wait: wait for the loadunit to accept the nc_with_data writeback
*/
io.flush := req_valid && req.uop.robIdx.needFlush(io.redirect)
/**
* NC operations
*
* States:
* (1) s_idle: wait for nc req from loadunit
* (2) s_req: hold the req until the uncache channel is ready to accept it
* (3) s_resp: wait for the response from the uncache channel
* (4) s_wait: wait for the loadunit to accept the nc_with_data writeback
*/
switch (uncacheState) {
is (s_idle) {
when (req_valid) {
uncacheState := s_req
}
}
is (s_req) {
when (io.uncache.req.fire) {
uncacheState := s_resp
}
}
is (s_resp) {
when (io.uncache.resp.fire) {
uncacheState := s_wait
}
}
is (s_wait) {
when (io.ncOut.fire) {
uncacheState := s_idle // ready for the next nc req
}
}
}
switch (uncacheState) {
is (s_idle) {
when (req_valid) {
uncacheState := s_req
}
}
is (s_req) {
when (io.uncache.req.fire) {
uncacheState := s_resp
}
}
is (s_resp) {
when (io.uncache.resp.fire) {
uncacheState := s_wait
}
}
is (s_wait) {
when (io.ncOut.fire) {
uncacheState := s_idle // ready for the next nc req
}
}
}
io.uncache.req.valid := uncacheState === s_req
io.uncache.req.bits := DontCare
io.uncache.req.bits.cmd := MemoryOpConstants.M_XRD
io.uncache.req.bits.data := DontCare
io.uncache.req.bits.addr := req.paddr
io.uncache.req.bits.mask := Mux(req.paddr(3), req.mask(15, 8), req.mask(7, 0))
io.uncache.req.bits.id := io.id
io.uncache.req.bits.instrtype := DontCare
io.uncache.req.bits.replayCarry := DontCare
io.uncache.req.bits.atomic := false.B
io.uncache.req.bits.nc := true.B
io.uncache.req.valid := uncacheState === s_req
io.uncache.req.bits := DontCare
io.uncache.req.bits.cmd := MemoryOpConstants.M_XRD
io.uncache.req.bits.data := DontCare
io.uncache.req.bits.addr := req.paddr
io.uncache.req.bits.vaddr:= req.vaddr
io.uncache.req.bits.mask := Mux(req.paddr(3), req.mask(15, 8), req.mask(7, 0))
io.uncache.req.bits.id := io.id
io.uncache.req.bits.instrtype := DontCare
io.uncache.req.bits.replayCarry := DontCare
io.uncache.req.bits.atomic := false.B
io.uncache.req.bits.nc := true.B
io.uncache.resp.ready := true.B
io.uncache.resp.ready := true.B
when (io.uncache.req.fire) {
XSDebug("uncache req: pc %x addr %x data %x op %x mask %x\n",
req.uop.pc,
io.uncache.req.bits.addr,
io.uncache.req.bits.data,
io.uncache.req.bits.cmd,
io.uncache.req.bits.mask
)
}
when (io.uncache.req.fire) {
XSDebug("uncache req: pc %x addr %x data %x op %x mask %x\n",
req.uop.pc,
io.uncache.req.bits.addr,
io.uncache.req.bits.data,
io.uncache.req.bits.cmd,
io.uncache.req.bits.mask
)
}
// (3) response from uncache channel
when (io.uncache.resp.fire) {
uncacheData := io.uncache.resp.bits.data
nderr := io.uncache.resp.bits.nderr
}
// (3) response from uncache channel
when (io.uncache.resp.fire) {
uncacheData := io.uncache.resp.bits.data
nderr := io.uncache.resp.bits.nderr
}
// uncache writeback
val selUop = req.uop
val func = selUop.fuOpType
val raddr = req.paddr
val rdataSel = LookupTree(raddr(2, 0), List(
"b000".U -> uncacheData(63, 0),
"b001".U -> uncacheData(63, 8),
"b010".U -> uncacheData(63, 16),
"b011".U -> uncacheData(63, 24),
"b100".U -> uncacheData(63, 32),
"b101".U -> uncacheData(63, 40),
"b110".U -> uncacheData(63, 48),
"b111".U -> uncacheData(63, 56)
))
val rdataPartialLoad = rdataHelper(selUop, rdataSel)
// uncache writeback
val selUop = req.uop
val func = selUop.fuOpType
val raddr = req.paddr
val rdataSel = LookupTree(raddr(2, 0), List(
"b000".U -> uncacheData(63, 0),
"b001".U -> uncacheData(63, 8),
"b010".U -> uncacheData(63, 16),
"b011".U -> uncacheData(63, 24),
"b100".U -> uncacheData(63, 32),
"b101".U -> uncacheData(63, 40),
"b110".U -> uncacheData(63, 48),
"b111".U -> uncacheData(63, 56)
))
val rdataPartialLoad = rdataHelper(selUop, rdataSel)
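// Worked example (values hypothetical): for a word load at raddr(2, 0) = "b100",
// rdataSel picks uncacheData(63, 32), and rdataHelper then extends the selected
// bytes according to the load's fuOpType before the nc_with_data writeback.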
io.ncOut.valid := (uncacheState === s_wait)
io.ncOut.bits := DontCare
io.ncOut.bits.uop := selUop
io.ncOut.bits.uop.lqIdx := req.uop.lqIdx
io.ncOut.bits.uop.exceptionVec(loadAccessFault) := nderr
io.ncOut.bits.data := rdataPartialLoad
io.ncOut.bits.paddr := req.paddr
io.ncOut.bits.vaddr := req.vaddr
io.ncOut.bits.nc := true.B
io.ncOut.bits.mask := Mux(req.paddr(3), req.mask(15, 8), req.mask(7, 0))
io.ncOut.bits.schedIndex := req.schedIndex
io.ncOut.bits.isvec := req.isvec
io.ncOut.bits.is128bit := req.is128bit
io.ncOut.bits.vecActive := req.vecActive
io.ncOut.valid := (uncacheState === s_wait)
io.ncOut.bits := DontCare
io.ncOut.bits.uop := selUop
io.ncOut.bits.uop.lqIdx := req.uop.lqIdx
io.ncOut.bits.uop.exceptionVec(loadAccessFault) := nderr
io.ncOut.bits.data := rdataPartialLoad
io.ncOut.bits.paddr := req.paddr
io.ncOut.bits.vaddr := req.vaddr
io.ncOut.bits.nc := true.B
io.ncOut.bits.mask := Mux(req.paddr(3), req.mask(15, 8), req.mask(7, 0))
io.ncOut.bits.schedIndex := req.schedIndex
io.ncOut.bits.isvec := req.isvec
io.ncOut.bits.is128bit := req.is128bit
io.ncOut.bits.vecActive := req.vecActive
io.exception.valid := io.ncOut.fire
io.exception.bits := req
io.exception.bits.uop.exceptionVec(loadAccessFault) := nderr
io.exception.valid := io.ncOut.fire
io.exception.bits := req
io.exception.bits.uop.exceptionVec(loadAccessFault) := nderr
when (io.ncOut.fire) {
req_valid := false.B
when (io.ncOut.fire) {
req_valid := false.B
XSInfo("int load miss write to cbd robidx %d lqidx %d pc 0x%x mmio %x\n",
io.ncOut.bits.uop.robIdx.asUInt,
io.ncOut.bits.uop.lqIdx.asUInt,
io.ncOut.bits.uop.pc,
true.B
)
}
XSInfo("int load miss write to cbd robidx %d lqidx %d pc 0x%x mmio %x\n",
io.ncOut.bits.uop.robIdx.asUInt,
io.ncOut.bits.uop.lqIdx.asUInt,
io.ncOut.bits.uop.pc,
true.B
)
}
// end
}
// end
}
class NCBuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper {
val io = IO(new Bundle() {
// control
val redirect = Flipped(Valid(new Redirect))
class NCBuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper {
val io = IO(new Bundle() {
// control
val redirect = Flipped(Valid(new Redirect))
//from loadunit
val req = Vec(LoadPipelineWidth, Flipped(Valid(new LqWriteBundle)))
//from loadunit
val req = Vec(LoadPipelineWidth, Flipped(Valid(new LqWriteBundle)))
//to loadunit: return response of nc with data
val ncOut = Vec(LoadPipelineWidth, Decoupled(new LsPipelineBundle))
//to loadunit: return response of nc with data
val ncOut = Vec(LoadPipelineWidth, Decoupled(new LsPipelineBundle))
// uncache io
val uncache = new UncacheWordIO
// uncache io
val uncache = new UncacheWordIO
// rollback from frontend when NCBuffer is full
val rollback = Output(Valid(new Redirect))
// rollback from frontend when NCBuffer is full
val rollback = Output(Valid(new Redirect))
// exception generated by outer bus
val exception = Valid(new LqWriteBundle)
})
// exception generated by outer bus
val exception = Valid(new LqWriteBundle)
})
val entries = Seq.tabulate(LoadNCBufferSize)(i => Module(new NCBufferEntry(i)))
val entries = Seq.tabulate(LoadNCBufferSize)(i => Module(new NCBufferEntry(i)))
// freelist: store valid entries index.
// +---+---+--------------+-----+-----+
// | 0 | 1 | ...... | n-2 | n-1 |
// +---+---+--------------+-----+-----+
val freeList = Module(new FreeList(
size = LoadNCBufferSize,
allocWidth = LoadPipelineWidth,
freeWidth = 4,
enablePreAlloc = true,
moduleName = "NCBuffer freelist"
))
freeList.io := DontCare
// freelist: store valid entries index.
// +---+---+--------------+-----+-----+
// | 0 | 1 | ...... | n-2 | n-1 |
// +---+---+--------------+-----+-----+
val freeList = Module(new FreeList(
size = LoadNCBufferSize,
allocWidth = LoadPipelineWidth,
freeWidth = 4,
enablePreAlloc = true,
moduleName = "NCBuffer freelist"
))
freeList.io := DontCare
// set enqueue default
entries.foreach {
case (e) =>
e.io.req.valid := false.B
e.io.req.bits := DontCare
}
// set enqueue default
entries.foreach {
case (e) =>
e.io.req.valid := false.B
e.io.req.bits := DontCare
}
// set uncache default
io.uncache.req.valid := false.B
io.uncache.req.bits := DontCare
io.uncache.resp.ready := false.B
// set uncache default
io.uncache.req.valid := false.B
io.uncache.req.bits := DontCare
io.uncache.resp.ready := false.B
entries.foreach {
case (e) =>
e.io.uncache.req.ready := false.B
e.io.uncache.resp.valid := false.B
e.io.uncache.resp.bits := DontCare
}
entries.foreach {
case (e) =>
e.io.uncache.req.ready := false.B
e.io.uncache.resp.valid := false.B
e.io.uncache.resp.bits := DontCare
}
// set writeback default
for (w <- 0 until LoadPipelineWidth) {
io.ncOut(w).valid := false.B
io.ncOut(w).bits := DontCare
}
// set writeback default
for (w <- 0 until LoadPipelineWidth) {
io.ncOut(w).valid := false.B
io.ncOut(w).bits := DontCare
}
// enqueue
// s1:
val s1_req = VecInit(io.req.map(_.bits))
val s1_valid = VecInit(io.req.map(_.valid))
// enqueue
// s1:
val s1_req = VecInit(io.req.map(_.bits))
val s1_valid = VecInit(io.req.map(_.valid))
// s2: enqueue
val s2_req = (0 until LoadPipelineWidth).map(i => {
RegEnable(s1_req(i), s1_valid(i))})
val s2_valid = (0 until LoadPipelineWidth).map(i => {
RegNext(s1_valid(i)) &&
!s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) &&
!s2_req(i).uop.robIdx.needFlush(io.redirect)
})
val s2_has_exception = s2_req.map(x => ExceptionNO.selectByFu(x.uop.exceptionVec, LduCfg).asUInt.orR)
val s2_need_replay = s2_req.map(_.rep_info.need_rep)
// s2: enqueue
val s2_req = (0 until LoadPipelineWidth).map(i => {
RegEnable(s1_req(i), s1_valid(i))})
val s2_valid = (0 until LoadPipelineWidth).map(i => {
RegNext(s1_valid(i)) &&
!s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) &&
!s2_req(i).uop.robIdx.needFlush(io.redirect)
})
val s2_has_exception = s2_req.map(x => ExceptionNO.selectByFu(x.uop.exceptionVec, LduCfg).asUInt.orR)
val s2_need_replay = s2_req.map(_.rep_info.need_rep)
val s2_enqueue = Wire(Vec(LoadPipelineWidth, Bool()))
for (w <- 0 until LoadPipelineWidth) {
s2_enqueue(w) := s2_valid(w) && !s2_has_exception(w) && !s2_need_replay(w) && s2_req(w).nc
}
val s2_enqueue = Wire(Vec(LoadPipelineWidth, Bool()))
for (w <- 0 until LoadPipelineWidth) {
s2_enqueue(w) := s2_valid(w) && !s2_has_exception(w) && !s2_need_replay(w) && s2_req(w).nc
}
//
val enqValidVec = Wire(Vec(LoadPipelineWidth, Bool()))
val enqIndexVec = Wire(Vec(LoadPipelineWidth, UInt()))
//
val enqValidVec = Wire(Vec(LoadPipelineWidth, Bool()))
val enqIndexVec = Wire(Vec(LoadPipelineWidth, UInt()))
for (w <- 0 until LoadPipelineWidth) {
freeList.io.allocateReq(w) := true.B
}
for (w <- 0 until LoadPipelineWidth) {
freeList.io.allocateReq(w) := true.B
}
// freeList real-allocate
for (w <- 0 until LoadPipelineWidth) {
freeList.io.doAllocate(w) := enqValidVec(w)
}
// freeList real-allocate
for (w <- 0 until LoadPipelineWidth) {
freeList.io.doAllocate(w) := enqValidVec(w)
}
for (w <- 0 until LoadPipelineWidth) {
enqValidVec(w) := s2_enqueue(w) && freeList.io.canAllocate(w)
for (w <- 0 until LoadPipelineWidth) {
enqValidVec(w) := s2_enqueue(w) && freeList.io.canAllocate(w)
val offset = PopCount(s2_enqueue.take(w))
enqIndexVec(w) := freeList.io.allocateSlot(offset)
}
val offset = PopCount(s2_enqueue.take(w))
enqIndexVec(w) := freeList.io.allocateSlot(offset)
}
// TODO lyq: It's best to choose in robIdx order
val uncacheReqArb = Module(new RRArbiterInit(io.uncache.req.bits.cloneType, LoadNCBufferSize))
val ncOutArb = Module(new RRArbiterInit(io.ncOut(0).bits.cloneType, LoadNCBufferSize))
// TODO lyq: It's best to choose in robIdx order / the order in which they enter
val uncacheReqArb = Module(new RRArbiterInit(io.uncache.req.bits.cloneType, LoadNCBufferSize))
val ncOutArb = Module(new RRArbiterInit(io.ncOut(0).bits.cloneType, LoadNCBufferSize))
entries.zipWithIndex.foreach {
case (e, i) =>
e.io.redirect <> io.redirect
e.io.id := i.U
entries.zipWithIndex.foreach {
case (e, i) =>
e.io.redirect <> io.redirect
e.io.id := i.U
// enqueue
for (w <- 0 until LoadPipelineWidth) {
when (enqValidVec(w) && (i.U === enqIndexVec(w))) {
e.io.req.valid := true.B
e.io.req.bits := s2_req(w)
}
}
// enqueue
for (w <- 0 until LoadPipelineWidth) {
when (enqValidVec(w) && (i.U === enqIndexVec(w))) {
e.io.req.valid := true.B
e.io.req.bits := s2_req(w)
}
}
// uncache logic
uncacheReqArb.io.in(i).valid := e.io.uncache.req.valid
uncacheReqArb.io.in(i).bits := e.io.uncache.req.bits
e.io.uncache.req.ready := uncacheReqArb.io.in(i).ready
ncOutArb.io.in(i).valid := e.io.ncOut.valid
ncOutArb.io.in(i).bits := e.io.ncOut.bits
e.io.ncOut.ready := ncOutArb.io.in(i).ready
// uncache logic
uncacheReqArb.io.in(i).valid := e.io.uncache.req.valid
uncacheReqArb.io.in(i).bits := e.io.uncache.req.bits
e.io.uncache.req.ready := uncacheReqArb.io.in(i).ready
ncOutArb.io.in(i).valid := e.io.ncOut.valid
ncOutArb.io.in(i).bits := e.io.ncOut.bits
e.io.ncOut.ready := ncOutArb.io.in(i).ready
when (i.U === io.uncache.resp.bits.id) {
e.io.uncache.resp <> io.uncache.resp
}
}
when (i.U === io.uncache.resp.bits.id) {
e.io.uncache.resp <> io.uncache.resp
}
}
// uncache Request
AddPipelineReg(uncacheReqArb.io.out, io.uncache.req, false.B)
// uncache Request
AddPipelineReg(uncacheReqArb.io.out, io.uncache.req, false.B)
// uncache Writeback
AddPipelineReg(ncOutArb.io.out, io.ncOut(0), false.B)
// uncache Writeback
AddPipelineReg(ncOutArb.io.out, io.ncOut(0), false.B)
// uncache exception
io.exception.valid := Cat(entries.map(_.io.exception.valid)).orR
io.exception.bits := ParallelPriorityMux(entries.map(e =>
(e.io.exception.valid, e.io.exception.bits)
))
// uncache exception
io.exception.valid := Cat(entries.map(_.io.exception.valid)).orR
io.exception.bits := ParallelPriorityMux(entries.map(e =>
(e.io.exception.valid, e.io.exception.bits)
))
// UncacheBuffer deallocate
val freeMaskVec = Wire(Vec(LoadNCBufferSize, Bool()))
// UncacheBuffer deallocate
val freeMaskVec = Wire(Vec(LoadNCBufferSize, Bool()))
// init
freeMaskVec.map(e => e := false.B)
// init
freeMaskVec.map(e => e := false.B)
// dealloc logic
entries.zipWithIndex.foreach {
case (e, i) =>
when (e.io.ncOut.fire || e.io.flush) {
freeMaskVec(i) := true.B
}
}
// dealloc logic
entries.zipWithIndex.foreach {
case (e, i) =>
when (e.io.ncOut.fire || e.io.flush) {
freeMaskVec(i) := true.B
}
}
freeList.io.free := freeMaskVec.asUInt
freeList.io.free := freeMaskVec.asUInt
/**
* Uncache rollback detection
*
* When uncache loads try to enqueue, detect those that cannot be allocated an entry; they need re-execution (rollback).
*
* Cycle 0: uncache enqueue.
* Cycle 1: Select oldest uncache loads.
* Cycle 2: Redirect Fire.
* Choose the oldest load from LoadPipelineWidth oldest loads.
* Prepare redirect request according to the detected rejection.
* Fire redirect request (if valid)
*/
// Load_S3 .... Load_S3
// stage 0: lq lq
// | | (can not enqueue)
// stage 1: lq lq
// | |
// ---------------
// |
// stage 2: lq
// |
// rollback req
def selectOldestRedirect(xs: Seq[Valid[Redirect]]): Vec[Bool] = {
val compareVec = (0 until xs.length).map(i => (0 until i).map(j => isAfter(xs(j).bits.robIdx, xs(i).bits.robIdx)))
val resultOnehot = VecInit((0 until xs.length).map(i => Cat((0 until xs.length).map(j =>
(if (j < i) !xs(j).valid || compareVec(i)(j)
else if (j == i) xs(i).valid
else !xs(j).valid || !compareVec(j)(i))
)).andR))
resultOnehot
}
val reqNeedCheck = VecInit((0 until LoadPipelineWidth).map(w =>
s2_enqueue(w) && !enqValidVec(w)
))
val reqSelUops = VecInit(s2_req.map(_.uop))
val allRedirect = (0 until LoadPipelineWidth).map(i => {
val redirect = Wire(Valid(new Redirect))
redirect.valid := reqNeedCheck(i)
redirect.bits := DontCare
redirect.bits.isRVC := reqSelUops(i).preDecodeInfo.isRVC
redirect.bits.robIdx := reqSelUops(i).robIdx
redirect.bits.ftqIdx := reqSelUops(i).ftqPtr
redirect.bits.ftqOffset := reqSelUops(i).ftqOffset
redirect.bits.level := RedirectLevel.flush
redirect.bits.cfiUpdate.target := reqSelUops(i).pc // TODO: check if need pc
redirect.bits.debug_runahead_checkpoint_id := reqSelUops(i).debugInfo.runahead_checkpoint_id
redirect
})
val oldestOneHot = selectOldestRedirect(allRedirect)
val oldestRedirect = Mux1H(oldestOneHot, allRedirect)
val lastCycleRedirect = Wire(Valid(new Redirect))
lastCycleRedirect.valid := RegNext(io.redirect.valid)
lastCycleRedirect.bits := RegEnable(io.redirect.bits, io.redirect.valid)
val lastLastCycleRedirect = Wire(Valid(new Redirect))
lastLastCycleRedirect.valid := RegNext(lastCycleRedirect.valid)
lastLastCycleRedirect.bits := RegEnable(lastCycleRedirect.bits, lastCycleRedirect.valid)
io.rollback.valid := GatedValidRegNext(oldestRedirect.valid &&
!oldestRedirect.bits.robIdx.needFlush(io.redirect) &&
!oldestRedirect.bits.robIdx.needFlush(lastCycleRedirect) &&
!oldestRedirect.bits.robIdx.needFlush(lastLastCycleRedirect))
io.rollback.bits := RegEnable(oldestRedirect.bits, oldestRedirect.valid)
/**
* Uncache rollback detection
*
* When uncache loads try to enqueue, detect those that cannot be allocated an entry; they need re-execution (rollback).
*
* Cycle 0: uncache enqueue.
* Cycle 1: Select oldest uncache loads.
* Cycle 2: Redirect Fire.
* Choose the oldest load from LoadPipelineWidth oldest loads.
* Prepare redirect request according to the detected rejection.
* Fire redirect request (if valid)
*/
// Load_S3 .... Load_S3
// stage 0: lq lq
// | | (can not enqueue)
// stage 1: lq lq
// | |
// ---------------
// |
// stage 2: lq
// |
// rollback req
def selectOldestRedirect(xs: Seq[Valid[Redirect]]): Vec[Bool] = {
val compareVec = (0 until xs.length).map(i => (0 until i).map(j => isAfter(xs(j).bits.robIdx, xs(i).bits.robIdx)))
val resultOnehot = VecInit((0 until xs.length).map(i => Cat((0 until xs.length).map(j =>
(if (j < i) !xs(j).valid || compareVec(i)(j)
else if (j == i) xs(i).valid
else !xs(j).valid || !compareVec(j)(i))
)).andR))
resultOnehot
}
val reqNeedCheck = VecInit((0 until LoadPipelineWidth).map(w =>
s2_enqueue(w) && !enqValidVec(w)
))
val reqSelUops = VecInit(s2_req.map(_.uop))
val allRedirect = (0 until LoadPipelineWidth).map(i => {
val redirect = Wire(Valid(new Redirect))
redirect.valid := reqNeedCheck(i)
redirect.bits := DontCare
redirect.bits.isRVC := reqSelUops(i).preDecodeInfo.isRVC
redirect.bits.robIdx := reqSelUops(i).robIdx
redirect.bits.ftqIdx := reqSelUops(i).ftqPtr
redirect.bits.ftqOffset := reqSelUops(i).ftqOffset
redirect.bits.level := RedirectLevel.flush
redirect.bits.cfiUpdate.target := reqSelUops(i).pc // TODO: check if need pc
redirect.bits.debug_runahead_checkpoint_id := reqSelUops(i).debugInfo.runahead_checkpoint_id
redirect
})
val oldestOneHot = selectOldestRedirect(allRedirect)
val oldestRedirect = Mux1H(oldestOneHot, allRedirect)
val lastCycleRedirect = Wire(Valid(new Redirect))
lastCycleRedirect.valid := RegNext(io.redirect.valid)
lastCycleRedirect.bits := RegEnable(io.redirect.bits, io.redirect.valid)
val lastLastCycleRedirect = Wire(Valid(new Redirect))
lastLastCycleRedirect.valid := RegNext(lastCycleRedirect.valid)
lastLastCycleRedirect.bits := RegEnable(lastCycleRedirect.bits, lastCycleRedirect.valid)
io.rollback.valid := GatedValidRegNext(oldestRedirect.valid &&
!oldestRedirect.bits.robIdx.needFlush(io.redirect) &&
!oldestRedirect.bits.robIdx.needFlush(lastCycleRedirect) &&
!oldestRedirect.bits.robIdx.needFlush(lastLastCycleRedirect))
io.rollback.bits := RegEnable(oldestRedirect.bits, oldestRedirect.valid)
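// Minimal usage sketch of the selection above (2-wide pipeline assumed): selectOldestRedirect
// returns a one-hot vector whose bit i is set when candidate i is valid and older (by robIdx)
// than every other valid candidate, so Mux1H then yields the oldest rollback request:
//   val oneHot = selectOldestRedirect(allRedirect) // e.g. bit 0 set if slot 0 holds the oldest
//   val oldest = Mux1H(oneHot, allRedirect)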
// perf counter
val validCount = freeList.io.validCount
val allowEnqueue = !freeList.io.empty
QueuePerf(LoadNCBufferSize, validCount, !allowEnqueue)
// perf counter
val validCount = freeList.io.validCount
val allowEnqueue = !freeList.io.empty
QueuePerf(LoadNCBufferSize, validCount, !allowEnqueue)
XSPerfAccumulate("ncReqCycle", VecInit(uncacheReqArb.io.in.map(_.fire)).asUInt.orR)
XSPerfAccumulate("ncUncacheReqCnt", io.uncache.req.fire)
XSPerfAccumulate("nc_writeback_success", io.ncOut(0).fire)
XSPerfAccumulate("nc_writeback_blocked", io.ncOut(0).valid && !io.ncOut(0).ready)
XSPerfAccumulate("uncache_full_rollback", io.rollback.valid)
XSPerfAccumulate("ncReqCycle", VecInit(uncacheReqArb.io.in.map(_.fire)).asUInt.orR)
XSPerfAccumulate("ncUncacheReqCnt", io.uncache.req.fire)
XSPerfAccumulate("nc_writeback_success", io.ncOut(0).fire)
XSPerfAccumulate("nc_writeback_blocked", io.ncOut(0).valid && !io.ncOut(0).ready)
XSPerfAccumulate("uncache_full_rollback", io.rollback.valid)
val perfEvents: Seq[(String, UInt)] = Seq(
("ncReqCycle", VecInit(uncacheReqArb.io.in.map(_.fire)).asUInt.orR),
("ncUncacheReqCnt", io.uncache.req.fire),
("nc_writeback_success", io.ncOut(0).fire),
("nc_writeback_blocked", io.ncOut(0).valid && !io.ncOut(0).ready),
("uncache_full_rollback", io.rollback.valid)
)
// end
}
val perfEvents: Seq[(String, UInt)] = Seq(
("ncReqCycle", VecInit(uncacheReqArb.io.in.map(_.fire)).asUInt.orR),
("ncUncacheReqCnt", io.uncache.req.fire),
("nc_writeback_success", io.ncOut(0).fire),
("nc_writeback_blocked", io.ncOut(0).valid && !io.ncOut(0).ready),
("uncache_full_rollback", io.rollback.valid)
)
// end
}

View File

@ -62,6 +62,7 @@ class HybridUnit(implicit p: Parameters) extends XSModule
// data path
val sbuffer = new LoadForwardQueryIO
val ubuffer = new LoadForwardQueryIO
val vec_forward = new LoadForwardQueryIO
val lsq = new LoadToLsqIO
val tl_d_channel = Input(new DcacheToLduForwardIO)
@ -608,6 +609,14 @@ class HybridUnit(implicit p: Parameters) extends XSModule
io.ldu_io.sbuffer.mask := s1_in.mask
io.ldu_io.sbuffer.pc := s1_in.uop.pc // FIXME: remove it
io.ldu_io.ubuffer.valid := s1_valid && !(s1_exception || s1_tlb_miss || s1_kill || s1_fast_rep_kill || s1_prf || !s1_ld_flow)
io.ldu_io.ubuffer.vaddr := s1_vaddr
io.ldu_io.ubuffer.paddr := s1_paddr_dup_lsu
io.ldu_io.ubuffer.uop := s1_in.uop
io.ldu_io.ubuffer.sqIdx := s1_in.uop.sqIdx
io.ldu_io.ubuffer.mask := s1_in.mask
io.ldu_io.ubuffer.pc := s1_in.uop.pc // FIXME: remove it
io.ldu_io.vec_forward.valid := s1_valid && !(s1_exception || s1_tlb_miss || s1_kill || s1_fast_rep_kill || s1_prf || !s1_ld_flow)
io.ldu_io.vec_forward.vaddr := s1_vaddr
io.ldu_io.vec_forward.paddr := s1_paddr_dup_lsu
@ -970,16 +979,12 @@ class HybridUnit(implicit p: Parameters) extends XSModule
s2_full_fwd := ((~s2_fwd_mask.asUInt).asUInt & s2_in.mask) === 0.U && !io.ldu_io.lsq.forward.dataInvalid && !io.ldu_io.vec_forward.dataInvalid
// generate XLEN/8 Muxs
for (i <- 0 until VLEN / 8) {
s2_fwd_mask(i) := io.ldu_io.lsq.forward.forwardMask(i) || io.ldu_io.sbuffer.forwardMask(i) || io.ldu_io.vec_forward.forwardMask(i)
s2_fwd_data(i) := Mux(
io.ldu_io.lsq.forward.forwardMask(i),
io.ldu_io.lsq.forward.forwardData(i),
Mux(
io.ldu_io.vec_forward.forwardMask(i),
io.ldu_io.vec_forward.forwardData(i),
io.ldu_io.sbuffer.forwardData(i)
)
)
s2_fwd_mask(i) := io.ldu_io.lsq.forward.forwardMask(i) || io.ldu_io.sbuffer.forwardMask(i) || io.ldu_io.vec_forward.forwardMask(i) || io.ldu_io.ubuffer.forwardMask(i)
s2_fwd_data(i) :=
Mux(io.ldu_io.lsq.forward.forwardMask(i), io.ldu_io.lsq.forward.forwardData(i),
Mux(io.ldu_io.vec_forward.forwardMask(i), io.ldu_io.vec_forward.forwardData(i),
Mux(io.ldu_io.ubuffer.forwardMask(i), io.ldu_io.ubuffer.forwardData(i),
io.ldu_io.sbuffer.forwardData(i))))
}
XSDebug(s2_fire && s2_ld_flow, "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n",
@ -1159,7 +1164,7 @@ class HybridUnit(implicit p: Parameters) extends XSModule
io.ldu_io.fast_rep_out.bits.delayedLoadError := s3_dly_ld_err
io.ldu_io.lsq.ldin.bits.dcacheRequireReplay := s3_dcache_rep
val s3_vp_match_fail = RegNext(io.ldu_io.lsq.forward.matchInvalid || io.ldu_io.sbuffer.matchInvalid) && s3_troublem
val s3_vp_match_fail = RegNext(io.ldu_io.lsq.forward.matchInvalid || io.ldu_io.sbuffer.matchInvalid || io.ldu_io.ubuffer.matchInvalid) && s3_troublem
val s3_ldld_rep_inst =
io.ldu_io.lsq.ldld_nuke_query.resp.valid &&
io.ldu_io.lsq.ldld_nuke_query.resp.bits.rep_frm_fetch &&

View File

@ -132,6 +132,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
val pmp = Flipped(new PMPRespBundle()) // arrive same to tlb now
val dcache = new DCacheLoadIO
val sbuffer = new LoadForwardQueryIO
val ubuffer = new LoadForwardQueryIO
val lsq = new LoadToLsqIO
val tl_d_channel = Input(new DcacheToLduForwardIO)
val forward_mshr = Flipped(new LduToMissqueueForwardIO)
@ -926,6 +927,14 @@ class LoadUnit(implicit p: Parameters) extends XSModule
io.sbuffer.mask := s1_in.mask
io.sbuffer.pc := s1_in.uop.pc // FIXME: remove it
io.ubuffer.valid := s1_valid && s1_nc_with_data && !(s1_exception || s1_tlb_miss || s1_kill || s1_dly_err || s1_prf)
io.ubuffer.vaddr := s1_vaddr
io.ubuffer.paddr := s1_paddr_dup_lsu
io.ubuffer.uop := s1_in.uop
io.ubuffer.sqIdx := s1_in.uop.sqIdx
io.ubuffer.mask := s1_in.mask
io.ubuffer.pc := s1_in.uop.pc // FIXME: remove it
io.lsq.forward.valid := s1_valid && !(s1_exception || s1_tlb_miss || s1_kill || s1_dly_err || s1_prf)
io.lsq.forward.vaddr := s1_vaddr
io.lsq.forward.paddr := s1_paddr_dup_lsu
@ -1244,7 +1253,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule
val s2_data_fwded = s2_dcache_miss && s2_full_fwd
val s2_vp_match_fail = (io.lsq.forward.matchInvalid || io.sbuffer.matchInvalid) && s2_troublem
val s2_fwd_vp_match_invalid = io.lsq.forward.matchInvalid || io.sbuffer.matchInvalid || io.ubuffer.matchInvalid
val s2_vp_match_fail = s2_fwd_vp_match_invalid && s2_troublem
val s2_safe_wakeup = !s2_out.rep_info.need_rep && !s2_mmio && (!s2_in.nc || s2_nc_with_data) && !s2_mis_align && !s2_exception // don't need to replay and is not a mmio\misalign no data
val s2_safe_writeback = s2_exception || s2_safe_wakeup || s2_vp_match_fail
@ -1271,8 +1281,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule
s2_full_fwd := ((~s2_fwd_mask.asUInt).asUInt & s2_in.mask) === 0.U && !io.lsq.forward.dataInvalid
// generate XLEN/8 Muxs
for (i <- 0 until VLEN / 8) {
s2_fwd_mask(i) := io.lsq.forward.forwardMask(i) || io.sbuffer.forwardMask(i)
s2_fwd_data(i) := Mux(io.lsq.forward.forwardMask(i), io.lsq.forward.forwardData(i), io.sbuffer.forwardData(i))
s2_fwd_mask(i) := io.lsq.forward.forwardMask(i) || io.sbuffer.forwardMask(i) || io.ubuffer.forwardMask(i)
s2_fwd_data(i) :=
Mux(io.lsq.forward.forwardMask(i), io.lsq.forward.forwardData(i),
Mux(s2_nc_with_data, io.ubuffer.forwardData(i),
io.sbuffer.forwardData(i)))
}
XSDebug(s2_fire, "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n",
@ -1458,7 +1471,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
io.lsq.ldin.bits.dcacheRequireReplay := s3_dcache_rep
io.fast_rep_out.bits.delayedLoadError := s3_dly_ld_err
val s3_vp_match_fail = GatedValidRegNext(io.lsq.forward.matchInvalid || io.sbuffer.matchInvalid) && s3_troublem
val s3_vp_match_fail = GatedValidRegNext(s2_fwd_vp_match_invalid) && s3_troublem
val s3_rep_frm_fetch = s3_vp_match_fail
val s3_ldld_rep_inst =
io.lsq.ldld_nuke_query.resp.valid &&

View File

@ -303,7 +303,7 @@ class Sbuffer(implicit p: Parameters)
// sbuffer_in_s1:
// * read data and meta from fifo queue
// * update sbuffer meta (vtag, ptag, flag)
// * prevert that line from being sent to dcache (add a block condition)
// * prevent that line from being sent to dcache (add a block condition)
// * prepare cacheline level write enable signal, RegNext() data and mask
// sbuffer_in_s2: