XiangShan/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala

1629 lines
78 KiB
Scala
Raw Normal View History

/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
2020-08-06 16:58:13 +08:00
package xiangshan.mem
2023-10-08 16:16:14 +08:00
import org.chipsalliance.cde.config.Parameters
2020-08-06 16:58:13 +08:00
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan.ExceptionNO._
2020-08-06 16:58:13 +08:00
import xiangshan._
import xiangshan.backend.Bundles.{DynInst, MemExuInput, MemExuOutput}
import xiangshan.backend.fu.PMPRespBundle
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.ctrlblock.{DebugLsInfoBundle, LsTopdownInfo}
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.ctrlblock.DebugLsInfoBundle
import xiangshan.backend.fu.util.SdtrigExt
import xiangshan.cache._
import xiangshan.cache.wpu.ReplayCarry
import xiangshan.cache.mmu._
import xiangshan.mem.mdp._
/**
  * Bundle of replay-related information for a load.
  *
  * `cause` holds one flag per entry of `LoadReplayCauses.allCauses`; the
  * `def` aliases at the bottom give each flag a readable name, and
  * `need_rep` is the OR-reduction of all of them.
  *
  * NOTE: field declaration order is part of the hardware layout of a Chisel
  * Bundle, so the order of the `val`s below must not be changed.
  */
class LoadToLsqReplayIO(implicit p: Parameters) extends XSBundle
  with HasDCacheParameters
  with HasTlbConst
{
  // mshr refill index (wide enough to index cfg.nMissEntries entries)
  val mshr_id         = UInt(log2Up(cfg.nMissEntries).W)
  // get full data from store queue and sbuffer
  val full_fwd        = Bool()
  // wait for data from store inst's store queue index
  val data_inv_sq_idx = new SqPtr
  // wait for address from store queue index
  val addr_inv_sq_idx = new SqPtr
  // replay carry (parameterised by the number of dcache ways)
  val rep_carry       = new ReplayCarry(nWays)
  // data in last beat
  val last_beat       = Bool()
  // replay cause: one Bool per cause listed in LoadReplayCauses
  val cause           = Vec(LoadReplayCauses.allCauses, Bool())
  // performance debug information
  val debug           = new PerfDebugInfo
  // tlb hint (tlb_id is wide enough to index loadfiltersize entries)
  val tlb_id          = UInt(log2Up(loadfiltersize).W)
  val tlb_full        = Bool()

  // Named aliases for the individual bits of `cause`.
  // NOTE(review): the expansions of the C_* abbreviations are not visible
  // here; the alias names are taken as given — confirm against LoadReplayCauses.
  def mem_amb       = cause(LoadReplayCauses.C_MA)
  def tlb_miss      = cause(LoadReplayCauses.C_TM)
  def fwd_fail      = cause(LoadReplayCauses.C_FF)
  def dcache_rep    = cause(LoadReplayCauses.C_DR)
  def dcache_miss   = cause(LoadReplayCauses.C_DM)
  def wpu_fail      = cause(LoadReplayCauses.C_WF)
  def bank_conflict = cause(LoadReplayCauses.C_BC)
  def rar_nack      = cause(LoadReplayCauses.C_RAR)
  def raw_nack      = cause(LoadReplayCauses.C_RAW)
  def nuke          = cause(LoadReplayCauses.C_NK)

  // True when at least one replay cause bit is set.
  def need_rep      = cause.asUInt.orR
}
/**
  * Interface bundle between a load unit and the load/store queue.
  *
  * Directions below are described from the point of view of whoever
  * instantiates this bundle un-flipped. Only what the declarations show is
  * stated as fact; the role of each sub-bundle is defined by its own class,
  * which is declared elsewhere.
  */
class LoadToLsqIO(implicit p: Parameters) extends XSBundle {
  // Ready/valid channel carrying an LqWriteBundle (producer side).
  val ldin            = DecoupledIO(new LqWriteBundle)
  // Ready/valid channel carrying a MemExuOutput, flipped (consumer side).
  val uncache         = Flipped(DecoupledIO(new MemExuOutput))
  // Input-only LoadDataFromLQBundle.
  val ld_raw_data     = Input(new LoadDataFromLQBundle)
  // Forwarding query port (PipeLoadForwardQueryIO).
  val forward         = new PipeLoadForwardQueryIO
  // Two query ports of the same type (LoadNukeQueryIO).
  // NOTE(review): presumably store-load and load-load violation checks
  // respectively — confirm against the LSQ side.
  val stld_nuke_query = new LoadNukeQueryIO
  val ldld_nuke_query = new LoadNukeQueryIO
  // Trigger port, flipped relative to LqTriggerIO's own declaration.
  val trigger         = Flipped(new LqTriggerIO)
}
// Payload of the load-to-load fast path: a valid flag, an XLEN-wide data word
// and an error flag. The fields carry no direction here; the user wraps the
// bundle in Input(...) / Output(...) as needed.
class LoadToLoadIO(implicit p: Parameters) extends XSBundle {
// qualifies the other fields of this bundle
val valid = Bool()
val data = UInt(XLEN.W) // load to load fast path is limited to ld (64 bit) used as vaddr src1 only
// NOTE(review): presumably a delayed load-error indication — confirm against the producer
val dly_ld_err = Bool()
}
// Per-trigger port of the load unit: three inputs configuring the trigger and
// one output reporting an address hit.
class LoadUnitTriggerIO(implicit p: Parameters) extends XSBundle {
// 64-bit compare value. NOTE(review): presumably the trigger's tdata2 register — confirm
val tdata2 = Input(UInt(64.W))
// 2-bit match-type selector; the encoding is not defined in this bundle
val matchType = Input(UInt(2.W))
val tEnable = Input(Bool()) // timing is calculated before this
// driven by the load unit to report that the address matched
val addrHit = Output(Bool())
}
class LoadUnit(implicit p: Parameters) extends XSModule
with HasLoadHelper
with HasPerfEvents
with HasDCacheParameters
with HasCircularQueuePtrHelper
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
with HasVLSUParameters
with SdtrigExt
{
2020-08-06 16:58:13 +08:00
val io = IO(new Bundle() {
// control
val redirect = Flipped(ValidIO(new Redirect))
val csrCtrl = Flipped(new CustomCSRCtrlIO)
// int issue path
val ldin = Flipped(Decoupled(new MemExuInput))
val ldout = Decoupled(new MemExuOutput)
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// vec issue path
val vecldin = Flipped(Decoupled(new VecPipeBundle))
2024-03-29 10:36:11 +08:00
val vecldout = Decoupled(new VecPipelineFeedbackIO(isVStore = false))
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// misalignBuffer issue path
val misalign_ldin = Flipped(Decoupled(new LsPipelineBundle))
val misalign_ldout = Valid(new LqWriteBundle)
// data path
val tlb = new TlbRequestIO(2)
val pmp = Flipped(new PMPRespBundle()) // arrive same to tlb now
val dcache = new DCacheLoadIO
val sbuffer = new LoadForwardQueryIO
val lsq = new LoadToLsqIO
val tl_d_channel = Input(new DcacheToLduForwardIO)
val forward_mshr = Flipped(new LduToMissqueueForwardIO)
// val refill = Flipped(ValidIO(new Refill))
2023-07-18 11:53:47 +08:00
val l2_hint = Input(Valid(new L2ToL1Hint))
val tlb_hint = Flipped(new TlbHintReq)
// fast wakeup
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// TODO: implement vector fast wakeup
val fast_uop = ValidIO(new DynInst) // early wakeup signal generated in load_s1, send to RS in load_s2
// trigger
val trigger = Vec(TriggerNum, new LoadUnitTriggerIO)
// prefetch
L1 Prefetch (#2261) * dcache: optimize the ready signal of missqueue Add a custom arbiter. In the case of multiple sources with the same cache block address, the arbiter will assign only one entry in misssqueue but ready for all same cache block address requests. This will reduce the number of replays of the load instruction which cannot enter the missqueue * sta, dcache: add A StorePipe in dcache When the store command passes through the sta pipeline, access the tag and meta of dcache to determine whether it hits, if it hits, update the replacement algorithm, and if miss, send a write intent to missqueue * sta prefetch: add a queue Enter this queue when the Store Address pipeline sends a request, determines that it has a cache miss, and the contention for MSHR fails. The miss request in this queue will be sent to the Store pipeline later. * sbuffer, dcache: store prefetch burst A basic implementation of "Boosting Store Buffer Efficiency with Store-Prefetch Bursts". Store prefetch at exe is disabled. Now, when store goes from sq to sbuffer, it will trigger a store prefetch; when 48 stores fall into 6 cache lines, trigger a store burst perfetch, which will bring a whole page back into dcache. * dcache: restric mshr alloc for prefetch req * restric the max number of entries which can be used by prefetch * merge two same cache line address prefetch write req * dynamically detect memset pattern, all mshr can accept prefetch when pattern is detected * spb: constantin support * dcache: fix missqueue prefetch ready * make prefetch req goes mshr with bigger id * Revert "spb: constantin support" This reverts commit 4ee50b89ba4a62cd28fa22d7fbcb2338ad4b1849. 
* spb: fix bug in burst generator * spb: add load prefetch burst support * topdown: add defines of topdown counters enum * redirect: add redirect type for perf * top-down: add stallReason IOs frontend -> ctrlBlock -> decode -> rename -> dispatch * top-down: add dummy connections * top-down: update TopdownCounters * top-down: imp backend analysis and counter dump * top-down: add HartId in `addSource` * top-down: broadcast lqIdx of ROB head * top-down: frontend signal done * top-down: add memblock topdown interface * Bump HuanCun: add TopDownMonitor * top-down: receive and handle reasons in dispatch * top-down: remove previous top-down code * TopDown: add MemReqSource enum * TopDown: extend mshr_latency range * TopDown: add basic Req Source TODO: distinguish prefetch * store prefetch: refactor parameters and fix bug * change some parameters * fix store pipe bug * fix load prefetch burst * dcache: distinguish L1DataPrefetch and CPUData * top-down: comment out debugging perf counters in ibuffer * TopDown: add path to pass MemReqSource to HuanCun * TopDown: use simpler logic to count reqSource and update Probe count * frontend: update topdown counters * Update HuanCun Topdown for MemReqSource * top-down: fix load stalls * top-down: Change the priority of different stall reasons * store prefetch: add stride and l2 prefetch * add a stride prefetcher * spb and stride will issue prefetch to l2 * when store commits, issue a prefetch to l1 * sbuffer: fix eviction * when valid count reaches StoreBufferSize, do eviction * spf: change store prefetch structure * prefetch @ exe -> l2 cache * stride -> l2 cache * sbuffer: fix replaceIdx * If the way selected by the replacement algorithm cannot be written into dcache, its result is not used. * Revert "sbuffer: fix replaceIdx" This reverts commit 40c16aca956af9fb32554a0f12d18db41c22eecd. 
* spf: find best interval in stamissqueue * Revert "spf: find best interval in stamissqueue" This reverts commit d179f0ce15a5ab989a822de7fe48cc5e2cd96914. * sms: port store to sms Miss store will train sms like load. Now, sms will recieve 4 train sources, 2 for miss load, 2 for miss store, but prefetcher consume 1 train req per cycle, PrefetchTrainFilter is added to deal with this case. * bump huancun * spf: refactor structure * miss stores will train sms, and send prefetch to l2 * miss stores will send prefetch to l1 on issue or commit * spb will send prefetch to l1 * memset: fix memset detection use lqEmpty to check this * constantin: storepf constantin support cherry-pick this to use constantin in storepf * Revert "constantin: storepf constantin support" This reverts commit 2b97767b9fa757d920cac3d80d4893a1380592c7. * storepf: add EnableAtCommitMissTrigger * trigger prefetch at commit only when the store misses with EnableAtCommitMissTrigger * bump coupledl2 * prefetch req from L1 to L2 will Acquire T * fix merge conflict * storepf: do not read meta&tag when pf is disabled * storepf: do not read pcMem when sms store is disabled * fix verilog check * fix verilog * missqueue: support merging prefetch * prefetch req can be merged to pipeline reg * merging prefetch write will update cmd * delay sending out acquire when a prefetch write is about to merge * missqueue: fix bug of merging prefetch write * delay sending out acquire when a pipeline reg is about to merging a prefetch write * temp: disable store pf * missqueue: disable merging prefetch * late prefetch will be ignored * check alias when merging * enable store pf at issue * add L1StreamPrefetcher * fix assert * let prefetch req prefer loadunit1 more than 0 * stream prefetcher * disable stream component in SMS, SMS is only trained on real miss * add a prefetcher monitor to adjust depth & confidence .. 
* add L1 L2 stream prefetch * add gene support * Revert "add gene support" This reverts commit 59ae15640ff3d1cc96347f4d3567d48c740a03bb. * add miss db * l1pf: add stride & store source info in cache meta * add a Stride prefetcher and disable Stride component in sms * prefetch bit in meta is expanded into 3 bits to store source info of prefetcher * prefetch: support sending prefetch req to l3 * l1pf: add FDP & refactor * add basic FDP counters * change stride from Block addr to Byte addr * refactor the code * bump submodules * disable load related chiseldb to reduce db size * fix compile * fix minimalConfig & enable stream * fix stride pc problem * fix minimalconfig compile * bump submodules * refactor stream stride helper * fix compile * bump huancun * disable db to save size * fix l2 assert * bump submodules --------- Co-authored-by: tastynoob <934348725@qq.com> Co-authored-by: Haojin Tang <tanghaojin@outlook.com> Co-authored-by: Guokai Chen <chenguokai17@mails.ucas.ac.cn> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Zhou Yaoyang <shinezyy@qq.com>
2023-09-06 16:07:59 +08:00
val prefetch_train = ValidIO(new LdPrefetchTrainBundle()) // provide prefetch info to sms
val prefetch_train_l1 = ValidIO(new LdPrefetchTrainBundle()) // provide prefetch info to stream & stride
// speculative for gated control
val s1_prefetch_spec = Output(Bool())
val s2_prefetch_spec = Output(Bool())
L1 Prefetch (#2261) * dcache: optimize the ready signal of missqueue Add a custom arbiter. In the case of multiple sources with the same cache block address, the arbiter will assign only one entry in misssqueue but ready for all same cache block address requests. This will reduce the number of replays of the load instruction which cannot enter the missqueue * sta, dcache: add A StorePipe in dcache When the store command passes through the sta pipeline, access the tag and meta of dcache to determine whether it hits, if it hits, update the replacement algorithm, and if miss, send a write intent to missqueue * sta prefetch: add a queue Enter this queue when the Store Address pipeline sends a request, determines that it has a cache miss, and the contention for MSHR fails. The miss request in this queue will be sent to the Store pipeline later. * sbuffer, dcache: store prefetch burst A basic implementation of "Boosting Store Buffer Efficiency with Store-Prefetch Bursts". Store prefetch at exe is disabled. Now, when store goes from sq to sbuffer, it will trigger a store prefetch; when 48 stores fall into 6 cache lines, trigger a store burst perfetch, which will bring a whole page back into dcache. * dcache: restric mshr alloc for prefetch req * restric the max number of entries which can be used by prefetch * merge two same cache line address prefetch write req * dynamically detect memset pattern, all mshr can accept prefetch when pattern is detected * spb: constantin support * dcache: fix missqueue prefetch ready * make prefetch req goes mshr with bigger id * Revert "spb: constantin support" This reverts commit 4ee50b89ba4a62cd28fa22d7fbcb2338ad4b1849. 
* spb: fix bug in burst generator * spb: add load prefetch burst support * topdown: add defines of topdown counters enum * redirect: add redirect type for perf * top-down: add stallReason IOs frontend -> ctrlBlock -> decode -> rename -> dispatch * top-down: add dummy connections * top-down: update TopdownCounters * top-down: imp backend analysis and counter dump * top-down: add HartId in `addSource` * top-down: broadcast lqIdx of ROB head * top-down: frontend signal done * top-down: add memblock topdown interface * Bump HuanCun: add TopDownMonitor * top-down: receive and handle reasons in dispatch * top-down: remove previous top-down code * TopDown: add MemReqSource enum * TopDown: extend mshr_latency range * TopDown: add basic Req Source TODO: distinguish prefetch * store prefetch: refactor parameters and fix bug * change some parameters * fix store pipe bug * fix load prefetch burst * dcache: distinguish L1DataPrefetch and CPUData * top-down: comment out debugging perf counters in ibuffer * TopDown: add path to pass MemReqSource to HuanCun * TopDown: use simpler logic to count reqSource and update Probe count * frontend: update topdown counters * Update HuanCun Topdown for MemReqSource * top-down: fix load stalls * top-down: Change the priority of different stall reasons * store prefetch: add stride and l2 prefetch * add a stride prefetcher * spb and stride will issue prefetch to l2 * when store commits, issue a prefetch to l1 * sbuffer: fix eviction * when valid count reaches StoreBufferSize, do eviction * spf: change store prefetch structure * prefetch @ exe -> l2 cache * stride -> l2 cache * sbuffer: fix replaceIdx * If the way selected by the replacement algorithm cannot be written into dcache, its result is not used. * Revert "sbuffer: fix replaceIdx" This reverts commit 40c16aca956af9fb32554a0f12d18db41c22eecd. 
* spf: find best interval in stamissqueue * Revert "spf: find best interval in stamissqueue" This reverts commit d179f0ce15a5ab989a822de7fe48cc5e2cd96914. * sms: port store to sms Miss store will train sms like load. Now, sms will recieve 4 train sources, 2 for miss load, 2 for miss store, but prefetcher consume 1 train req per cycle, PrefetchTrainFilter is added to deal with this case. * bump huancun * spf: refactor structure * miss stores will train sms, and send prefetch to l2 * miss stores will send prefetch to l1 on issue or commit * spb will send prefetch to l1 * memset: fix memset detection use lqEmpty to check this * constantin: storepf constantin support cherry-pick this to use constantin in storepf * Revert "constantin: storepf constantin support" This reverts commit 2b97767b9fa757d920cac3d80d4893a1380592c7. * storepf: add EnableAtCommitMissTrigger * trigger prefetch at commit only when the store misses with EnableAtCommitMissTrigger * bump coupledl2 * prefetch req from L1 to L2 will Acquire T * fix merge conflict * storepf: do not read meta&tag when pf is disabled * storepf: do not read pcMem when sms store is disabled * fix verilog check * fix verilog * missqueue: support merging prefetch * prefetch req can be merged to pipeline reg * merging prefetch write will update cmd * delay sending out acquire when a prefetch write is about to merge * missqueue: fix bug of merging prefetch write * delay sending out acquire when a pipeline reg is about to merging a prefetch write * temp: disable store pf * missqueue: disable merging prefetch * late prefetch will be ignored * check alias when merging * enable store pf at issue * add L1StreamPrefetcher * fix assert * let prefetch req prefer loadunit1 more than 0 * stream prefetcher * disable stream component in SMS, SMS is only trained on real miss * add a prefetcher monitor to adjust depth & confidence .. 
* add L1 L2 stream prefetch * add gene support * Revert "add gene support" This reverts commit 59ae15640ff3d1cc96347f4d3567d48c740a03bb. * add miss db * l1pf: add stride & store source info in cache meta * add a Stride prefetcher and disable Stride component in sms * prefetch bit in meta is expanded into 3 bits to store source info of prefetcher * prefetch: support sending prefetch req to l3 * l1pf: add FDP & refactor * add basic FDP counters * change stride from Block addr to Byte addr * refactor the code * bump submodules * disable load related chiseldb to reduce db size * fix compile * fix minimalConfig & enable stream * fix stride pc problem * fix minimalconfig compile * bump submodules * refactor stream stride helper * fix compile * bump huancun * disable db to save size * fix l2 assert * bump submodules --------- Co-authored-by: tastynoob <934348725@qq.com> Co-authored-by: Haojin Tang <tanghaojin@outlook.com> Co-authored-by: Guokai Chen <chenguokai17@mails.ucas.ac.cn> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Zhou Yaoyang <shinezyy@qq.com>
2023-09-06 16:07:59 +08:00
val prefetch_req = Flipped(ValidIO(new L1PrefetchReq)) // hardware prefetch to l1 cache req
val canAcceptLowConfPrefetch = Output(Bool())
val canAcceptHighConfPrefetch = Output(Bool())
// ifetchPrefetch
val ifetchPrefetch = ValidIO(new SoftIfetchPrefetchBundle)
// load to load fast path
val l2l_fwd_in = Input(new LoadToLoadIO)
val l2l_fwd_out = Output(new LoadToLoadIO)
val ld_fast_match = Input(Bool())
val ld_fast_fuOpType = Input(UInt())
val ld_fast_imm = Input(UInt(12.W))
// rs feedback
val wakeup = ValidIO(new DynInst)
val feedback_fast = ValidIO(new RSFeedback) // stage 2
val feedback_slow = ValidIO(new RSFeedback) // stage 3
2023-09-12 01:11:11 +08:00
val ldCancel = Output(new LoadCancelIO()) // use to cancel the uops waked by this load, and cancel load
// load ecc error
val s3_dly_ld_err = Output(Bool()) // Note that io.s3_dly_ld_err and io.lsq.s3_dly_ld_err is different
// schedule error query
val stld_nuke_query = Flipped(Vec(StorePipelineWidth, Valid(new StoreNukeQueryIO)))
// queue-based replay
val replay = Flipped(Decoupled(new LsPipelineBundle))
val lq_rep_full = Input(Bool())
// misc
val s2_ptr_chasing = Output(Bool()) // provide right pc for hw prefetch
// Load fast replay path
val fast_rep_in = Flipped(Decoupled(new LqWriteBundle))
val fast_rep_out = Decoupled(new LqWriteBundle)
// to misalign buffer
val misalign_buf = Valid(new LqWriteBundle)
// Load RAR rollback
val rollback = Valid(new Redirect)
// perf
L1 Prefetch (#2261) * dcache: optimize the ready signal of missqueue Add a custom arbiter. In the case of multiple sources with the same cache block address, the arbiter will assign only one entry in misssqueue but ready for all same cache block address requests. This will reduce the number of replays of the load instruction which cannot enter the missqueue * sta, dcache: add A StorePipe in dcache When the store command passes through the sta pipeline, access the tag and meta of dcache to determine whether it hits, if it hits, update the replacement algorithm, and if miss, send a write intent to missqueue * sta prefetch: add a queue Enter this queue when the Store Address pipeline sends a request, determines that it has a cache miss, and the contention for MSHR fails. The miss request in this queue will be sent to the Store pipeline later. * sbuffer, dcache: store prefetch burst A basic implementation of "Boosting Store Buffer Efficiency with Store-Prefetch Bursts". Store prefetch at exe is disabled. Now, when store goes from sq to sbuffer, it will trigger a store prefetch; when 48 stores fall into 6 cache lines, trigger a store burst perfetch, which will bring a whole page back into dcache. * dcache: restric mshr alloc for prefetch req * restric the max number of entries which can be used by prefetch * merge two same cache line address prefetch write req * dynamically detect memset pattern, all mshr can accept prefetch when pattern is detected * spb: constantin support * dcache: fix missqueue prefetch ready * make prefetch req goes mshr with bigger id * Revert "spb: constantin support" This reverts commit 4ee50b89ba4a62cd28fa22d7fbcb2338ad4b1849. 
* spb: fix bug in burst generator * spb: add load prefetch burst support * topdown: add defines of topdown counters enum * redirect: add redirect type for perf * top-down: add stallReason IOs frontend -> ctrlBlock -> decode -> rename -> dispatch * top-down: add dummy connections * top-down: update TopdownCounters * top-down: imp backend analysis and counter dump * top-down: add HartId in `addSource` * top-down: broadcast lqIdx of ROB head * top-down: frontend signal done * top-down: add memblock topdown interface * Bump HuanCun: add TopDownMonitor * top-down: receive and handle reasons in dispatch * top-down: remove previous top-down code * TopDown: add MemReqSource enum * TopDown: extend mshr_latency range * TopDown: add basic Req Source TODO: distinguish prefetch * store prefetch: refactor parameters and fix bug * change some parameters * fix store pipe bug * fix load prefetch burst * dcache: distinguish L1DataPrefetch and CPUData * top-down: comment out debugging perf counters in ibuffer * TopDown: add path to pass MemReqSource to HuanCun * TopDown: use simpler logic to count reqSource and update Probe count * frontend: update topdown counters * Update HuanCun Topdown for MemReqSource * top-down: fix load stalls * top-down: Change the priority of different stall reasons * store prefetch: add stride and l2 prefetch * add a stride prefetcher * spb and stride will issue prefetch to l2 * when store commits, issue a prefetch to l1 * sbuffer: fix eviction * when valid count reaches StoreBufferSize, do eviction * spf: change store prefetch structure * prefetch @ exe -> l2 cache * stride -> l2 cache * sbuffer: fix replaceIdx * If the way selected by the replacement algorithm cannot be written into dcache, its result is not used. * Revert "sbuffer: fix replaceIdx" This reverts commit 40c16aca956af9fb32554a0f12d18db41c22eecd. 
* spf: find best interval in stamissqueue * Revert "spf: find best interval in stamissqueue" This reverts commit d179f0ce15a5ab989a822de7fe48cc5e2cd96914. * sms: port store to sms Miss store will train sms like load. Now, sms will recieve 4 train sources, 2 for miss load, 2 for miss store, but prefetcher consume 1 train req per cycle, PrefetchTrainFilter is added to deal with this case. * bump huancun * spf: refactor structure * miss stores will train sms, and send prefetch to l2 * miss stores will send prefetch to l1 on issue or commit * spb will send prefetch to l1 * memset: fix memset detection use lqEmpty to check this * constantin: storepf constantin support cherry-pick this to use constantin in storepf * Revert "constantin: storepf constantin support" This reverts commit 2b97767b9fa757d920cac3d80d4893a1380592c7. * storepf: add EnableAtCommitMissTrigger * trigger prefetch at commit only when the store misses with EnableAtCommitMissTrigger * bump coupledl2 * prefetch req from L1 to L2 will Acquire T * fix merge conflict * storepf: do not read meta&tag when pf is disabled * storepf: do not read pcMem when sms store is disabled * fix verilog check * fix verilog * missqueue: support merging prefetch * prefetch req can be merged to pipeline reg * merging prefetch write will update cmd * delay sending out acquire when a prefetch write is about to merge * missqueue: fix bug of merging prefetch write * delay sending out acquire when a pipeline reg is about to merging a prefetch write * temp: disable store pf * missqueue: disable merging prefetch * late prefetch will be ignored * check alias when merging * enable store pf at issue * add L1StreamPrefetcher * fix assert * let prefetch req prefer loadunit1 more than 0 * stream prefetcher * disable stream component in SMS, SMS is only trained on real miss * add a prefetcher monitor to adjust depth & confidence .. 
* add L1 L2 stream prefetch * add gene support * Revert "add gene support" This reverts commit 59ae15640ff3d1cc96347f4d3567d48c740a03bb. * add miss db * l1pf: add stride & store source info in cache meta * add a Stride prefetcher and disable Stride component in sms * prefetch bit in meta is expanded into 3 bits to store source info of prefetcher * prefetch: support sending prefetch req to l3 * l1pf: add FDP & refactor * add basic FDP counters * change stride from Block addr to Byte addr * refactor the code * bump submodules * disable load related chiseldb to reduce db size * fix compile * fix minimalConfig & enable stream * fix stride pc problem * fix minimalconfig compile * bump submodules * refactor stream stride helper * fix compile * bump huancun * disable db to save size * fix l2 assert * bump submodules --------- Co-authored-by: tastynoob <934348725@qq.com> Co-authored-by: Haojin Tang <tanghaojin@outlook.com> Co-authored-by: Guokai Chen <chenguokai17@mails.ucas.ac.cn> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Zhou Yaoyang <shinezyy@qq.com>
2023-09-06 16:07:59 +08:00
val debug_ls = Output(new DebugLsInfoBundle)
val lsTopdownInfo = Output(new LsTopdownInfo)
val correctMissTrain = Input(Bool())
2020-08-06 16:58:13 +08:00
})
// Per-stage "can accept a new flow" wires. They only receive their default (false) here.
// NOTE(review): presumably each later stage overrides its own wire further down -- confirm.
val s1_ready, s2_ready, s3_ready = WireInit(false.B)

// Pipeline
// --------------------------------------------------------------------------------
// stage 0
// --------------------------------------------------------------------------------
// generate addr, use addr to query DCache and DTLB
val s0_valid        = Wire(Bool())            // some non-mmio source is valid, dcache is ready and s0 is not killed
val s0_mmio_select  = Wire(Bool())            // the mmio source won arbitration and s0 is not killed
val s0_kill         = Wire(Bool())            // kill the flow currently in stage 0
val s0_can_go       = s1_ready                // stage 0 may advance only when stage 1 can accept
val s0_fire         = s0_valid && s0_can_go
val s0_mmio_fire    = s0_mmio_select && s0_can_go
val s0_out          = Wire(new LqWriteBundle) // NOTE(review): presumably the bundle handed to stage 1 -- confirm
val s0_tlb_vaddr    = Wire(UInt(VAddrBits.W)) // virtual address sent with the DTLB request
val s0_dcache_vaddr = Wire(UInt(VAddrBits.W)) // NOTE(review): presumably the virtual address sent to the DCache -- confirm
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
/**
  * Flow source bundle.
  *
  * Carries the request fields of one stage-0 load flow; `s0_sel_src` is a wire of this type.
  * Field order is kept exactly as declared originally, because it determines the bundle layout.
  */
class FlowSource extends Bundle {
  val vaddr         = UInt(VAddrBits.W)
  val mask          = UInt((VLEN/8).W)                  // one bit per byte of a VLEN-wide access
  val uop           = new DynInst
  val try_l2l       = Bool()
  val has_rob_entry = Bool()
  val rep_carry     = new ReplayCarry(nWays)
  val mshrid        = UInt(log2Up(cfg.nMissEntries).W)  // wide enough to index every miss entry
  val isFirstIssue  = Bool()
  val fast_rep      = Bool()
  val ld_rep        = Bool()
  val l2l_fwd       = Bool()
  // Prefetch flags. `prf && prf_wr` selects a TLB write command; `prf_i` suppresses both the
  // DTLB and the DCache request valid.
  val prf           = Bool()
  val prf_rd        = Bool()
  val prf_wr        = Bool()
  val prf_i         = Bool()
  val sched_idx     = UInt(log2Up(LoadQueueReplaySize+1).W)
  // Forwarded to the DTLB request as `hyperinst` / `hlvx`.
  val hlv           = Bool()
  val hlvx          = Bool()
  // Record the issue port idx of load issue queue. This signal is used by load cancel.
  val deqPortIdx    = UInt(log2Ceil(LoadPipelineWidth).W)
  val frm_mabuf     = Bool()
  // vec only
  val isvec         = Bool()
  val is128bit      = Bool()
  val uop_unit_stride_fof = Bool()
  val reg_offset    = UInt(vOffsetBits.W)
  val vecActive     = Bool() // 1: vector active element or scalar mem operation, 0: vector not active element
  val is_first_ele  = Bool()
  // val flowPtr = new VlflowPtr
  val usSecondInv   = Bool()
  val mbIndex       = UInt(vlmBindexBits.W)
  val elemIdx       = UInt(elemIdxBits.W)
  val elemIdxInsideVd = UInt(elemIdxBits.W)
  val alignedType   = UInt(alignTypeBits.W)             // bits (2,0) give the TLB request size for vector flows
}
// Request fields of the stage-0 load flow.
val s0_sel_src = Wire(new FlowSource)

// Stage-0 load flow arbitration.
// Sources, from highest to lowest priority:
//   src0: misalign buffer load                                   (io.misalign_ldin)
//   src1: super load replayed by LSQ (cache miss replay)         (io.replay)
//   src2: fast load replay                                       (io.fast_rep_in)
//   src3: mmio                                                   (io.lsq.uncache)
//   src4: load replayed by LSQ                                   (io.replay)
//   src5: hardware prefetch from prefetcher (high confidence)    (io.prefetch_req)
//   src6: vec read from RS                                       (io.vecldin)
//   src7: int read / software prefetch first issue from RS       (io.ldin)
//   src8: load tries pointer chasing when no load is issued or replayed (io.l2l_fwd_in)
//   src9: hardware prefetch from prefetcher (low confidence)     (io.prefetch_req)
// NOTE: vec/int loads are now sent from the same RS.
//       A vec load is split into multiple uops, so as long as one uop has been issued,
//       the other uops should have higher priority.

// A normal (non-super) LSQ replay is held back while an int issue is valid and
// isAfter(replay robIdx, issue robIdx) holds.
// NOTE(review): presumably this means the replayed load is younger than the issuing one -- confirm.
val s0_rep_stall = io.ldin.valid && isAfter(io.replay.bits.uop.robIdx, io.ldin.bits.uop.robIdx)

// Symbolic names for the ten source slots; a smaller index means higher priority.
private val SRC_NUM = 10
private val Seq(
  mab_idx, super_rep_idx, fast_rep_idx, mmio_idx, lsq_rep_idx,
  high_pf_idx, vec_iss_idx, int_iss_idx, l2l_fwd_idx, low_pf_idx
) = Seq.range(0, SRC_NUM)

// load flow source valid, one entry per source in priority order
val s0_src_valid_vec = WireInit(VecInit(Seq(
  io.misalign_ldin.valid,                                                     // src0
  io.replay.valid && io.replay.bits.forward_tlDchannel,                       // src1
  io.fast_rep_in.valid,                                                       // src2
  io.lsq.uncache.valid,                                                       // src3
  io.replay.valid && !(io.replay.bits.forward_tlDchannel || s0_rep_stall),    // src4
  io.prefetch_req.valid && io.prefetch_req.bits.confidence > 0.U,             // src5
  io.vecldin.valid,                                                           // src6
  io.ldin.valid,                                                              // src7: int flow first issue or software prefetch
  io.l2l_fwd_in.valid,                                                        // src8
  io.prefetch_req.valid && io.prefetch_req.bits.confidence === 0.U            // src9
)))

// load flow source ready: source i is ready iff no source with a smaller index is valid
val s0_src_ready_vec = Wire(Vec(SRC_NUM, Bool()))
s0_src_ready_vec.zipWithIndex.foreach {
  case (ready, 0) => ready := true.B
  case (ready, i) => ready := !s0_src_valid_vec.take(i).reduce(_ || _)
}

// load flow source select (OH): valid and not masked by a higher-priority source
val s0_src_select_vec = WireInit(VecInit(
  s0_src_valid_vec.zip(s0_src_ready_vec).map { case (valid, ready) => valid && ready }
))
val s0_hw_prf_select = Seq(high_pf_idx, low_pf_idx).map(i => s0_src_select_vec(i)).reduce(_ || _)

// Keep the arbitration vectors from being optimized away.
Seq(s0_src_valid_vec, s0_src_ready_vec, s0_src_select_vec).foreach(dontTouch(_))

// Stage 0 holds a valid flow when any source other than mmio is valid, the mmio source has
// not won arbitration, the dcache accepts a request and the flow is not killed.
s0_valid := {
  val nonMmioSrcs = Seq(
    mab_idx, super_rep_idx, fast_rep_idx, lsq_rep_idx, high_pf_idx,
    vec_iss_idx, int_iss_idx, l2l_fwd_idx, low_pf_idx
  )
  nonMmioSrcs.map(i => s0_src_valid_vec(i)).reduce(_ || _) &&
    !s0_src_select_vec(mmio_idx) && io.dcache.req.ready && !s0_kill
}
s0_mmio_select := !s0_kill && s0_src_select_vec(mmio_idx)

// Pointer chasing (load-to-load forwarding) is tried when src8 wins arbitration.
val s0_try_ptr_chasing = s0_src_select_vec(l2l_fwd_idx)
// It is actually attempted only when s0's output is ready and the dcache is ready.
val s0_do_try_ptr_chasing = Seq(s0_try_ptr_chasing, s0_can_go, io.dcache.req.ready).reduce(_ && _)
// Low 6 bits of the forwarded data plus low 6 bits of the immediate; `+&` keeps the carry-out.
val s0_ptr_chasing_vaddr = io.l2l_fwd_in.data(5, 0) +& io.ld_fast_imm(5, 0)
// Defaults to "not canceled".
val s0_ptr_chasing_canceled = WireInit(false.B)
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Stage 0 is killed exactly when the pointer-chasing attempt is canceled.
s0_kill := s0_ptr_chasing_canceled

// prefetch related ctrl signal
// A hardware prefetch of the given confidence class can be accepted when no higher-priority
// source is valid and the dcache can take a request.
Seq(
  io.canAcceptLowConfPrefetch  -> low_pf_idx,
  io.canAcceptHighConfPrefetch -> high_pf_idx
).foreach { case (canAccept, srcIdx) =>
  canAccept := s0_src_ready_vec(srcIdx) && io.dcache.req.ready
}
L1 Prefetch (#2261) * dcache: optimize the ready signal of missqueue Add a custom arbiter. In the case of multiple sources with the same cache block address, the arbiter will assign only one entry in misssqueue but ready for all same cache block address requests. This will reduce the number of replays of the load instruction which cannot enter the missqueue * sta, dcache: add A StorePipe in dcache When the store command passes through the sta pipeline, access the tag and meta of dcache to determine whether it hits, if it hits, update the replacement algorithm, and if miss, send a write intent to missqueue * sta prefetch: add a queue Enter this queue when the Store Address pipeline sends a request, determines that it has a cache miss, and the contention for MSHR fails. The miss request in this queue will be sent to the Store pipeline later. * sbuffer, dcache: store prefetch burst A basic implementation of "Boosting Store Buffer Efficiency with Store-Prefetch Bursts". Store prefetch at exe is disabled. Now, when store goes from sq to sbuffer, it will trigger a store prefetch; when 48 stores fall into 6 cache lines, trigger a store burst perfetch, which will bring a whole page back into dcache. * dcache: restric mshr alloc for prefetch req * restric the max number of entries which can be used by prefetch * merge two same cache line address prefetch write req * dynamically detect memset pattern, all mshr can accept prefetch when pattern is detected * spb: constantin support * dcache: fix missqueue prefetch ready * make prefetch req goes mshr with bigger id * Revert "spb: constantin support" This reverts commit 4ee50b89ba4a62cd28fa22d7fbcb2338ad4b1849. 
* spb: fix bug in burst generator * spb: add load prefetch burst support * topdown: add defines of topdown counters enum * redirect: add redirect type for perf * top-down: add stallReason IOs frontend -> ctrlBlock -> decode -> rename -> dispatch * top-down: add dummy connections * top-down: update TopdownCounters * top-down: imp backend analysis and counter dump * top-down: add HartId in `addSource` * top-down: broadcast lqIdx of ROB head * top-down: frontend signal done * top-down: add memblock topdown interface * Bump HuanCun: add TopDownMonitor * top-down: receive and handle reasons in dispatch * top-down: remove previous top-down code * TopDown: add MemReqSource enum * TopDown: extend mshr_latency range * TopDown: add basic Req Source TODO: distinguish prefetch * store prefetch: refactor parameters and fix bug * change some parameters * fix store pipe bug * fix load prefetch burst * dcache: distinguish L1DataPrefetch and CPUData * top-down: comment out debugging perf counters in ibuffer * TopDown: add path to pass MemReqSource to HuanCun * TopDown: use simpler logic to count reqSource and update Probe count * frontend: update topdown counters * Update HuanCun Topdown for MemReqSource * top-down: fix load stalls * top-down: Change the priority of different stall reasons * store prefetch: add stride and l2 prefetch * add a stride prefetcher * spb and stride will issue prefetch to l2 * when store commits, issue a prefetch to l1 * sbuffer: fix eviction * when valid count reaches StoreBufferSize, do eviction * spf: change store prefetch structure * prefetch @ exe -> l2 cache * stride -> l2 cache * sbuffer: fix replaceIdx * If the way selected by the replacement algorithm cannot be written into dcache, its result is not used. * Revert "sbuffer: fix replaceIdx" This reverts commit 40c16aca956af9fb32554a0f12d18db41c22eecd. 
* spf: find best interval in stamissqueue * Revert "spf: find best interval in stamissqueue" This reverts commit d179f0ce15a5ab989a822de7fe48cc5e2cd96914. * sms: port store to sms Miss store will train sms like load. Now, sms will recieve 4 train sources, 2 for miss load, 2 for miss store, but prefetcher consume 1 train req per cycle, PrefetchTrainFilter is added to deal with this case. * bump huancun * spf: refactor structure * miss stores will train sms, and send prefetch to l2 * miss stores will send prefetch to l1 on issue or commit * spb will send prefetch to l1 * memset: fix memset detection use lqEmpty to check this * constantin: storepf constantin support cherry-pick this to use constantin in storepf * Revert "constantin: storepf constantin support" This reverts commit 2b97767b9fa757d920cac3d80d4893a1380592c7. * storepf: add EnableAtCommitMissTrigger * trigger prefetch at commit only when the store misses with EnableAtCommitMissTrigger * bump coupledl2 * prefetch req from L1 to L2 will Acquire T * fix merge conflict * storepf: do not read meta&tag when pf is disabled * storepf: do not read pcMem when sms store is disabled * fix verilog check * fix verilog * missqueue: support merging prefetch * prefetch req can be merged to pipeline reg * merging prefetch write will update cmd * delay sending out acquire when a prefetch write is about to merge * missqueue: fix bug of merging prefetch write * delay sending out acquire when a pipeline reg is about to merging a prefetch write * temp: disable store pf * missqueue: disable merging prefetch * late prefetch will be ignored * check alias when merging * enable store pf at issue * add L1StreamPrefetcher * fix assert * let prefetch req prefer loadunit1 more than 0 * stream prefetcher * disable stream component in SMS, SMS is only trained on real miss * add a prefetcher monitor to adjust depth & confidence .. 
* add L1 L2 stream prefetch * add gene support * Revert "add gene support" This reverts commit 59ae15640ff3d1cc96347f4d3567d48c740a03bb. * add miss db * l1pf: add stride & store source info in cache meta * add a Stride prefetcher and disable Stride component in sms * prefetch bit in meta is expanded into 3 bits to store source info of prefetcher * prefetch: support sending prefetch req to l3 * l1pf: add FDP & refactor * add basic FDP counters * change stride from Block addr to Byte addr * refactor the code * bump submodules * disable load related chiseldb to reduce db size * fix compile * fix minimalConfig & enable stream * fix stride pc problem * fix minimalconfig compile * bump submodules * refactor stream stride helper * fix compile * bump huancun * disable db to save size * fix l2 assert * bump submodules --------- Co-authored-by: tastynoob <934348725@qq.com> Co-authored-by: Haojin Tang <tanghaojin@outlook.com> Co-authored-by: Guokai Chen <chenguokai17@mails.ucas.ac.cn> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Zhou Yaoyang <shinezyy@qq.com>
2023-09-06 16:07:59 +08:00
2023-07-18 11:53:47 +08:00
// query DTLB
io.tlb.req.valid := s0_valid && !s0_hw_prf_select && !s0_sel_src.prf_i // if is hardware prefetch, don't send valid to tlb, but need no_translate
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
io.tlb.req.bits.cmd := Mux(s0_sel_src.prf,
Mux(s0_sel_src.prf_wr, TlbCmd.write, TlbCmd.read),
TlbCmd.read
)
io.tlb.req.bits.vaddr := s0_tlb_vaddr
2024-03-25 21:15:46 +08:00
io.tlb.req.bits.hyperinst := s0_sel_src.hlv
io.tlb.req.bits.hlvx := s0_sel_src.hlvx
io.tlb.req.bits.size := Mux(s0_sel_src.isvec, s0_sel_src.alignedType(2,0), LSUOpType.size(s0_sel_src.uop.fuOpType))
io.tlb.req.bits.kill := s0_kill
io.tlb.req.bits.memidx.is_ld := true.B
io.tlb.req.bits.memidx.is_st := false.B
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
io.tlb.req.bits.memidx.idx := s0_sel_src.uop.lqIdx.value
io.tlb.req.bits.debug.robIdx := s0_sel_src.uop.robIdx
io.tlb.req.bits.no_translate := s0_hw_prf_select // hw b.reqetch addr does not need to be translated, need this signal for pmp check
io.tlb.req.bits.debug.pc := s0_sel_src.uop.pc
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
io.tlb.req.bits.debug.isFirstIssue := s0_sel_src.isFirstIssue
// query DCache
// A request is raised whenever s0 is valid, except when the selected source has
// `prf_i` set; such flows never produce a DCache request here.
io.dcache.req.valid             := s0_valid && !s0_sel_src.prf_i
// Command priority: prf_rd selects M_PFR, otherwise prf_wr selects M_PFW,
// otherwise the request is an ordinary read (M_XRD).
io.dcache.req.bits.cmd          := Mux(
  s0_sel_src.prf_rd,
  MemoryOpConstants.M_PFR,
  Mux(s0_sel_src.prf_wr, MemoryOpConstants.M_PFW, MemoryOpConstants.M_XRD)
)
// The address comes from the dedicated `s0_dcache_vaddr` signal, not from
// `s0_sel_src.vaddr`.
// NOTE(review): presumably a separately selected address for timing reasons - confirm.
io.dcache.req.bits.vaddr        := s0_dcache_vaddr
io.dcache.req.bits.mask         := s0_sel_src.mask
// No data is driven on this request.
io.dcache.req.bits.data         := DontCare
io.dcache.req.bits.isFirstIssue := s0_sel_src.isFirstIssue
// Flows flagged `prf` are tagged DCACHE_PREFETCH_SOURCE, all others LOAD_SOURCE.
io.dcache.req.bits.instrtype    := Mux(s0_sel_src.prf, DCACHE_PREFETCH_SOURCE.U, LOAD_SOURCE.U)
io.dcache.req.bits.debug_robIdx := s0_sel_src.uop.robIdx.value
io.dcache.req.bits.replayCarry  := s0_sel_src.rep_carry
io.dcache.req.bits.id           := DontCare // TODO: update cache meta
io.dcache.req.bits.lqIdx        := s0_sel_src.uop.lqIdx
// The prefetch source id is forwarded only when `s0_hw_prf_select` is set;
// every other flow reports L1_HW_PREFETCH_NULL.
io.dcache.pf_source             := Mux(s0_hw_prf_select, io.prefetch_req.bits.pf_source.value, L1_HW_PREFETCH_NULL)
io.dcache.is128Req              := s0_sel_src.is128bit
// load flow priority mux
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
/** Builds a [[FlowSource]] wire in which every field is zero.
  *
  * The bundle is produced by reinterpreting the literal `0.U` as a `FlowSource`,
  * so all flags read as false and all data fields read as 0.
  * NOTE(review): presumably the placeholder used when no load flow source is
  * selected by the priority mux - confirm against the caller.
  *
  * @return an all-zero FlowSource wire
  */
def fromNullSource(): FlowSource = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
out
}
/** Converts a request from the misalign buffer into a [[FlowSource]].
  *
  * Starts from an all-zero bundle, forwards the request-specific fields of `src`,
  * marks the flow as originating from the misalign buffer (`frm_mabuf`) and keeps
  * every replay, forwarding, prefetch and vector flag listed below cleared.
  * Fields that are not assigned here keep their zero initial value.
  *
  * @param src pipeline bundle describing the misalign-buffer request
  * @return the populated FlowSource wire
  */
def fromMisAlignBufferSource(src: LsPipelineBundle): FlowSource = {
  val out = WireInit(0.U.asTypeOf(new FlowSource))

  // Fields copied straight from the incoming request.
  out.vaddr     := src.vaddr
  out.mask      := src.mask
  out.uop       := src.uop
  out.rep_carry := src.replayCarry
  out.mshrid    := src.mshrid
  out.sched_idx := src.schedIndex
  out.is128bit  := src.is128bit

  // hlv / hlvx are decoded from the uop's function op type.
  out.hlv  := LSUOpType.isHlv(src.uop.fuOpType)
  out.hlvx := LSUOpType.isHlvx(src.uop.fuOpType)

  // Flags that are always asserted for this source.
  Seq(out.frm_mabuf, out.vecActive).foreach(_ := true.B)

  // Flags that are always de-asserted for this source.
  Seq(
    out.try_l2l,
    out.has_rob_entry,
    out.isFirstIssue,
    out.fast_rep,
    out.ld_rep,
    out.l2l_fwd,
    out.prf,
    out.prf_rd,
    out.prf_wr,
    out.isvec
  ).foreach(_ := false.B)

  out
}
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
/** Converts a fast-replay request into a [[FlowSource]].
  *
  * Starts from an all-zero bundle and forwards the replay bookkeeping
  * (`rep_carry`, `mshrid`, `sched_idx`), the vector element metadata and the
  * origin flags of `src`. The flow is always marked `fast_rep` and is never a
  * first issue.
  *
  * `vaddr` is not assigned here, so it keeps its zero initial value in the
  * returned bundle.
  * NOTE(review): presumably the fast-replay address is selected separately from
  * this bundle - confirm.
  *
  * @param src load-queue write bundle carrying the request to replay
  * @return the populated FlowSource wire
  */
def fromFastReplaySource(src: LqWriteBundle): FlowSource = {
  val out = WireInit(0.U.asTypeOf(new FlowSource))
  out.mask                := src.mask
  out.uop                 := src.uop
  out.try_l2l             := false.B
  out.has_rob_entry       := src.hasROBEntry
  out.rep_carry           := src.rep_info.rep_carry
  out.mshrid              := src.rep_info.mshr_id
  out.frm_mabuf           := src.isFrmMisAlignBuf
  out.isFirstIssue        := false.B
  out.fast_rep            := true.B
  out.ld_rep              := src.isLoadReplay
  out.l2l_fwd             := false.B
  // `prf` is only raised for non-vector uops whose op type is a prefetch;
  // `prf_rd` / `prf_wr` compare the op type directly, and `prf_i` is always cleared.
  out.prf                 := LSUOpType.isPrefetch(src.uop.fuOpType) && !src.isvec
  out.prf_rd              := src.uop.fuOpType === LSUOpType.prefetch_r
  out.prf_wr              := src.uop.fuOpType === LSUOpType.prefetch_w
  out.prf_i               := false.B
  out.sched_idx           := src.schedIndex
  // Vector-related fields are forwarded unchanged from the replayed request.
  out.isvec               := src.isvec
  out.is128bit            := src.is128bit
  out.uop_unit_stride_fof := src.uop_unit_stride_fof
  out.reg_offset          := src.reg_offset
  out.vecActive           := src.vecActive
  out.is_first_ele        := src.is_first_ele
  out.usSecondInv         := src.usSecondInv
  out.mbIndex             := src.mbIndex
  out.elemIdx             := src.elemIdx
  out.elemIdxInsideVd     := src.elemIdxInsideVd
  out.alignedType         := src.alignedType
  // hlv / hlvx are decoded from the uop's function op type.
  out.hlv                 := LSUOpType.isHlv(src.uop.fuOpType)
  out.hlvx                := LSUOpType.isHlvx(src.uop.fuOpType)
  out
}
// TODO: implement vector mmio
2024-01-03 10:42:03 +08:00
/** Converts an MMIO (uncached) load writeback into a [[FlowSource]].
  *
  * Only the uop is taken from `src`. The byte mask, replay carry, MSHR id and
  * schedule index are driven to zero, every replay, forwarding and prefetch flag
  * is cleared, and `vecActive` is tied high. Fields that are not assigned here
  * (for example `isvec` and `is128bit`) keep their zero initial value.
  *
  * @param src execution-unit output bundle of the MMIO load
  * @return the populated FlowSource wire
  */
def fromMmioSource(src: MemExuOutput): FlowSource = {
  val out = WireInit(0.U.asTypeOf(new FlowSource))
  out.mask          := 0.U
  out.uop           := src.uop
  out.try_l2l       := false.B
  out.has_rob_entry := false.B
  out.rep_carry     := 0.U.asTypeOf(out.rep_carry)
  out.mshrid        := 0.U
  out.frm_mabuf     := false.B
  out.isFirstIssue  := false.B
  out.fast_rep      := false.B
  out.ld_rep        := false.B
  out.l2l_fwd       := false.B
  out.prf           := false.B
  out.prf_rd        := false.B
  out.prf_wr        := false.B
  out.prf_i         := false.B
  out.sched_idx     := 0.U
  // hlv / hlvx are decoded from the uop's function op type.
  out.hlv           := LSUOpType.isHlv(src.uop.fuOpType)
  out.hlvx          := LSUOpType.isHlvx(src.uop.fuOpType)
  out.vecActive     := true.B
  out
}
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Builds the FlowSource for a load re-entering the pipeline as a normal replay:
// ld_rep is set, while fast_rep and isFirstIssue are cleared.
// The wire starts out all-zero; every field below is then driven either from
// `src` or from a constant.
def fromNormalReplaySource(src: LsPipelineBundle): FlowSource = {
  val flow = WireInit(0.U.asTypeOf(new FlowSource))
  val opType = src.uop.fuOpType

  // Identity and replay bookkeeping copied from the replayed request.
  flow.uop := src.uop
  flow.rep_carry := src.replayCarry
  flow.mshrid := src.mshrid
  flow.sched_idx := src.schedIndex
  flow.has_rob_entry := true.B

  // Vector requests reuse the mask they carry; otherwise the mask is generated
  // from the virtual address and the low two bits of the op type.
  flow.mask := Mux(src.isvec, src.mask, genVWmask(src.vaddr, opType(1, 0)))

  // Source-kind flags: this is a load replay and nothing else.
  flow.ld_rep := true.B
  Seq(flow.try_l2l, flow.frm_mabuf, flow.isFirstIssue, flow.fast_rep, flow.l2l_fwd, flow.prf_i)
    .foreach(_ := false.B)

  // Prefetch classification, decoded from the op type.
  // NOTE(review): prf is suppressed for vector requests but prf_rd / prf_wr are
  // not gated by !src.isvec -- confirm this asymmetry is intended.
  flow.prf := LSUOpType.isPrefetch(opType) && !src.isvec
  flow.prf_rd := opType === LSUOpType.prefetch_r
  flow.prf_wr := opType === LSUOpType.prefetch_w

  // Hypervisor load flavours, decoded from the op type.
  flow.hlv := LSUOpType.isHlv(opType)
  flow.hlvx := LSUOpType.isHlvx(opType)

  // Vector-related fields are forwarded unchanged from the replayed request.
  flow.isvec := src.isvec
  flow.is128bit := src.is128bit
  flow.alignedType := src.alignedType
  flow.vecActive := src.vecActive
  flow.uop_unit_stride_fof := src.uop_unit_stride_fof
  flow.usSecondInv := src.usSecondInv
  flow.is_first_ele := src.is_first_ele
  flow.reg_offset := src.reg_offset
  flow.mbIndex := src.mbIndex
  flow.elemIdx := src.elemIdx
  flow.elemIdxInsideVd := src.elemIdxInsideVd

  flow
}
// TODO: implement vector prefetch
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Builds the FlowSource for a prefetch request.
// The result is flagged as a prefetch (prf) with no ROB entry; whether it is a
// read or a write prefetch follows src.is_store. The uop is left as DontCare.
def fromPrefetchSource(src: L1PrefetchReq): FlowSource = {
  val flow = WireInit(0.U.asTypeOf(new FlowSource))

  // No instruction backs this request: uop is unconstrained and the
  // per-request bookkeeping fields are tied to zero.
  flow.uop := DontCare
  flow.mask := 0.U
  flow.mshrid := 0.U
  flow.sched_idx := 0.U
  flow.rep_carry := 0.U.asTypeOf(flow.rep_carry)

  // Prefetch direction is selected by is_store.
  flow.prf := true.B
  flow.prf_wr := src.is_store
  flow.prf_rd := !src.is_store

  // Every remaining source-kind flag is explicitly cleared.
  Seq(
    flow.try_l2l,
    flow.has_rob_entry,
    flow.frm_mabuf,
    flow.isFirstIssue,
    flow.fast_rep,
    flow.ld_rep,
    flow.l2l_fwd,
    flow.prf_i
  ).foreach(_ := false.B)

  flow
}
// Builds the FlowSource for a load issued from the vector pipe.
// The result is marked isvec with a ROB entry; all replay, load-to-load and
// prefetch flags are cleared, and the vector element bookkeeping is forwarded
// from `src`.
def fromVecIssueSource(src: VecPipeBundle): FlowSource = {
  val flow = WireInit(0.U.asTypeOf(new FlowSource))

  // Request identity; the byte mask is supplied by the vector request itself.
  flow.uop := src.uop
  flow.mask := src.mask
  flow.has_rob_entry := true.B
  flow.isvec := true.B

  // TODO: VLSU, implement replay carry (tied to zero for now).
  flow.rep_carry := 0.U.asTypeOf(flow.rep_carry)
  flow.mshrid := 0.U
  flow.sched_idx := 0.U

  // TODO: VLSU, implement first issue. isFirstIssue is not driven from src here,
  // so it keeps the zero value the wire was initialised with.

  // Not a replay, not load-to-load, not a prefetch, not a hypervisor load.
  Seq(
    flow.try_l2l,
    flow.frm_mabuf,
    flow.fast_rep,
    flow.ld_rep,
    flow.l2l_fwd,
    flow.prf,
    flow.prf_rd,
    flow.prf_wr,
    flow.prf_i,
    flow.hlv,
    flow.hlvx
  ).foreach(_ := false.B)

  // Vector load interface.
  // is128bit is derived from the request's aligned type via is128Bit.
  flow.alignedType := src.alignedType
  flow.is128bit := is128Bit(src.alignedType)
  flow.vecActive := src.vecActive
  flow.uop_unit_stride_fof := src.uop_unit_stride_fof
  flow.usSecondInv := src.usSecondInv
  flow.is_first_ele := src.is_first_ele
  flow.reg_offset := src.reg_offset
  flow.elemIdx := src.elemIdx
  flow.elemIdxInsideVd := src.elemIdxInsideVd
  // Note the differing capitalisation: FlowSource.mbIndex <- VecPipeBundle.mBIndex.
  flow.mbIndex := src.mBIndex
  // rob_idx_valid, inner_idx, rob_idx, offset and flowPtr are not forwarded.

  flow
}
// Builds the S0 flow descriptor for a scalar load coming from the integer issue path.
//
// src: the issue request (uop plus source operands).
// Returns a FlowSource wire; every field that is not assigned below keeps the
// all-zero default supplied by WireInit.
def fromIntIssueSource(src: MemExuInput): FlowSource = {
  val out = WireInit(0.U.asTypeOf(new FlowSource))
  // Effective address = base register + sign-extended 12-bit immediate.
  // Derived from the `src` argument (not from io.ldin.bits directly) so the mask
  // always matches the request that was actually passed in.
  val addr = src.src(0) + SignExt(src.uop.imm(11, 0), VAddrBits)
  // Low two bits of fuOpType select the access size used for the byte mask.
  out.mask          := genVWmask(addr, src.uop.fuOpType(1, 0))
  out.uop           := src.uop
  out.try_l2l       := false.B
  out.has_rob_entry := true.B
  out.rep_carry     := 0.U.asTypeOf(out.rep_carry)
  out.mshrid        := 0.U
  out.frm_mabuf     := false.B
  out.isFirstIssue  := true.B
  out.fast_rep      := false.B
  out.ld_rep        := false.B
  out.l2l_fwd       := false.B
  // Software prefetch classification is decoded from fuOpType.
  out.prf           := LSUOpType.isPrefetch(src.uop.fuOpType)
  out.prf_rd        := src.uop.fuOpType === LSUOpType.prefetch_r
  out.prf_wr        := src.uop.fuOpType === LSUOpType.prefetch_w
  out.prf_i         := src.uop.fuOpType === LSUOpType.prefetch_i
  out.sched_idx     := 0.U
  out.hlv           := LSUOpType.isHlv(src.uop.fuOpType)
  out.hlvx          := LSUOpType.isHlvx(src.uop.fuOpType)
  out.vecActive     := true.B // always true for a scalar load
  out
}
// TODO: implement vector l2l
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Builds the S0 flow descriptor used when attempting load-to-load forwarding
// (pointer chasing). Note that `src` is not read in this body: every field is
// driven with a constant, and anything not assigned keeps the all-zero default.
def fromLoadToLoadSource(src: LoadToLoadIO): FlowSource = {
  val flow = WireInit(0.U.asTypeOf(new FlowSource))

  // Load-to-load forwarding is tried when there is no valid instruction from RS
  // or LSQ. The pointer-chasing access is assumed to always be a plain `ld`.
  flow.uop.fuOpType := LSUOpType.ld
  flow.mask         := genVWmask(0.U, LSUOpType.ld)
  flow.hlv          := LSUOpType.isHlv(flow.uop.fuOpType)
  flow.hlvx         := LSUOpType.isHlvx(flow.uop.fuOpType)

  // Flags that identify this flow as a load-to-load attempt.
  flow.try_l2l := true.B
  flow.l2l_fwd := true.B

  // isFirstIssue, rsIdx and s0_sqIdx are "don't care" in S0 for pointer chasing,
  // because these signals will be updated in S1.
  flow.isFirstIssue  := true.B
  flow.has_rob_entry := false.B

  // Not a replay, not from the misalign buffer, no MSHR / scheduler bookkeeping.
  flow.fast_rep  := false.B
  flow.ld_rep    := false.B
  flow.frm_mabuf := false.B
  flow.rep_carry := 0.U.asTypeOf(flow.rep_carry)
  flow.mshrid    := 0.U
  flow.sched_idx := 0.U

  // Never a prefetch.
  flow.prf    := false.B
  flow.prf_rd := false.B
  flow.prf_wr := false.B
  flow.prf_i  := false.B

  flow
}
// set default
// The selector starts as a copy of the per-source valid bits.
val s0_src_selector = WireInit(s0_src_valid_vec)
if (!EnableLoadToLoadForward) {
  // Load-to-load forwarding is disabled by configuration: mask that slot off.
  s0_src_selector(l2l_fwd_idx) := false.B
}

// One candidate flow per selector entry, in the same order as s0_src_selector.
// io.replay and io.prefetch_req each appear twice because they feed two slots.
val s0_src_format = Seq(
  fromMisAlignBufferSource(io.misalign_ldin.bits), // misalign buffer
  fromNormalReplaySource(io.replay.bits),          // LSQ replay
  fromFastReplaySource(io.fast_rep_in.bits),       // fast replay
  fromMmioSource(io.lsq.uncache.bits),             // uncache / mmio
  fromNormalReplaySource(io.replay.bits),          // LSQ replay (second slot)
  fromPrefetchSource(io.prefetch_req.bits),        // prefetch
  fromVecIssueSource(io.vecldin.bits),             // vector issue
  fromIntIssueSource(io.ldin.bits),                // integer issue
  if (EnableLoadToLoadForward) fromLoadToLoadSource(io.l2l_fwd_in) else fromNullSource(),
  fromPrefetchSource(io.prefetch_req.bits)         // prefetch (second slot)
)

// Pick the flow belonging to the selected source.
s0_sel_src := ParallelPriorityMux(s0_src_selector, s0_src_format)
2020-10-17 21:05:46 +08:00
// Valid bits of the sources that can supply a virtual address for the TLB request.
val s0_addr_selector = Seq(
  s0_src_valid_vec(mab_idx),
  s0_src_valid_vec(super_rep_idx),
  s0_src_valid_vec(fast_rep_idx),
  s0_src_valid_vec(lsq_rep_idx),
  s0_src_valid_vec(vec_iss_idx),
  s0_src_valid_vec(int_iss_idx),
  if (EnableLoadToLoadForward) s0_src_valid_vec(l2l_fwd_idx) else false.B
)

// Virtual address offered by each of the sources above (same order).
val s0_addr_format = Seq(
  io.misalign_ldin.bits.vaddr,
  io.replay.bits.vaddr,
  io.fast_rep_in.bits.vaddr,
  io.replay.bits.vaddr,
  io.vecldin.bits.vaddr,
  // integer issue: base register + sign-extended 12-bit immediate
  io.ldin.bits.src(0) + SignExt(io.ldin.bits.uop.imm(11, 0), VAddrBits),
  // load-to-load: upper bits from the forwarded data, low 6 bits from the
  // pointer-chasing address
  if (EnableLoadToLoadForward) Cat(io.l2l_fwd_in.data(XLEN-1, 6), s0_ptr_chasing_vaddr(5,0))
  else 0.U(VAddrBits.W)
)

s0_tlb_vaddr := ParallelPriorityMux(s0_addr_selector, s0_addr_format)
// The dcache address is the prefetch request's own vaddr when a hardware prefetch
// is selected; otherwise it is the same address that goes to the TLB.
s0_dcache_vaddr := Mux(
  s0_hw_prf_select,
  io.prefetch_req.bits.getVaddr(),
  s0_tlb_vaddr
)
2023-01-29 16:29:30 +08:00
// address align check
// The 2-bit size code comes from alignedType for vector flows and from
// fuOpType for everything else.
val s0_addr_aligned = LookupTree(
  Mux(s0_sel_src.isvec, s0_sel_src.alignedType(1,0), s0_sel_src.uop.fuOpType(1, 0)),
  List(
    "b00".U -> true.B,                            // byte: always aligned
    "b01".U -> (s0_dcache_vaddr(0) === 0.U),      // half word
    "b10".U -> (s0_dcache_vaddr(1, 0) === 0.U),   // word
    "b11".U -> (s0_dcache_vaddr(2, 0) === 0.U)    // double word
  )
)
// A vector flow with alignedType(2) set must have the low 4 address bits clear.
XSError(
  s0_sel_src.isvec && s0_sel_src.alignedType(2) && (s0_dcache_vaddr(3, 0) =/= 0.U),
  "unit-stride 128 bit element is not aligned!"
)
// accept load flow if dcache ready (tlb is always ready)
// TODO: prefetch need writeback to loadQueueFlag
// Everything not explicitly driven below is left as DontCare.
s0_out := DontCare

// ---- address, mask and uop ----
s0_out.vaddr := s0_dcache_vaddr
s0_out.mask  := s0_sel_src.mask
s0_out.uop   := s0_sel_src.uop

// ---- flow attributes copied from the selected source ----
s0_out.isFirstIssue := s0_sel_src.isFirstIssue
s0_out.hasROBEntry  := s0_sel_src.has_rob_entry
s0_out.isPrefetch   := s0_sel_src.prf
s0_out.isHWPrefetch := s0_hw_prf_select
s0_out.isFastReplay := s0_sel_src.fast_rep
s0_out.isLoadReplay := s0_sel_src.ld_rep
s0_out.isFastPath   := s0_sel_src.l2l_fwd
s0_out.mshrid       := s0_sel_src.mshrid

// ---- vector / misalign-buffer attributes ----
s0_out.isvec               := s0_sel_src.isvec
s0_out.is128bit            := s0_sel_src.is128bit
s0_out.isFrmMisAlignBuf    := s0_sel_src.frm_mabuf
s0_out.uop_unit_stride_fof := s0_sel_src.uop_unit_stride_fof
s0_out.paddr               := io.prefetch_req.bits.paddr // only for prefetch
// s0_out.rob_idx_valid := s0_rob_idx_valid
// s0_out.inner_idx := s0_inner_idx
// s0_out.rob_idx := s0_rob_idx
s0_out.reg_offset      := s0_sel_src.reg_offset
// s0_out.offset := s0_offset
s0_out.vecActive       := s0_sel_src.vecActive
s0_out.usSecondInv     := s0_sel_src.usSecondInv
s0_out.is_first_ele    := s0_sel_src.is_first_ele
s0_out.elemIdx         := s0_sel_src.elemIdx
s0_out.elemIdxInsideVd := s0_sel_src.elemIdxInsideVd
s0_out.alignedType     := s0_sel_src.alignedType
s0_out.mbIndex         := s0_sel_src.mbIndex
// s0_out.flowPtr := s0_sel_src.flowPtr

// Must stay after the bulk `s0_out.uop` connection so this bit overrides it.
// Misaligned is flagged when the address fails the alignment check or the source
// uop already carries the exception, and only while vecActive is set.
s0_out.uop.exceptionVec(loadAddrMisaligned) :=
  (!s0_addr_aligned || s0_sel_src.uop.exceptionVec(loadAddrMisaligned)) && s0_sel_src.vecActive
s0_out.forward_tlDchannel := s0_src_select_vec(super_rep_idx)
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Debug timestamp: when a TLB request is valid for a first-issue source, stamp
// tlbFirstReqTime with GTimer(); the otherwise branch keeps the timestamp that
// the source uop already carries.
when(io.tlb.req.valid && s0_sel_src.isFirstIssue) {
s0_out.uop.debugInfo.tlbFirstReqTime := GTimer()
}.otherwise{
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
s0_out.uop.debugInfo.tlbFirstReqTime := s0_sel_src.uop.debugInfo.tlbFirstReqTime
}
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Forward the selected source's scheduler index.
s0_out.schedIndex := s0_sel_src.sched_idx
// load fast replay
// Accepted when stage 0 can advance, the dcache request port is ready and the
// fast-replay entry of s0_src_ready_vec is set.
io.fast_rep_in.ready := (s0_can_go && io.dcache.req.ready && s0_src_ready_vec(fast_rep_idx))
2020-10-27 18:11:11 +08:00
2024-01-03 10:42:03 +08:00
// mmio
// The uncache (mmio) channel is acknowledged exactly when s0_mmio_fire is asserted.
io.lsq.uncache.ready := s0_mmio_fire
// load flow source ready
// cache missed load has highest priority
// always accept cache missed load flow from load replay queue
// Replay is accepted when stage 0 can advance and the dcache request port is
// ready, and either the lsq-replay source is ready and not stalled
// (s0_rep_stall), or the super-replay source is the one selected.
io.replay.ready := (s0_can_go && io.dcache.req.ready && (s0_src_ready_vec(lsq_rep_idx) && !s0_rep_stall || s0_src_select_vec(super_rep_idx)))
// accept load flow from rs when:
// 1) there is no lsq-replayed load
// 2) there is no fast replayed load
// 3) there is no high confidence prefetch request
// Each issue port below uses the same gating (stage 0 can advance, dcache
// request port ready) combined with its own entry of s0_src_ready_vec.
io.vecldin.ready := s0_can_go && io.dcache.req.ready && s0_src_ready_vec(vec_iss_idx)
io.ldin.ready := s0_can_go && io.dcache.req.ready && s0_src_ready_vec(int_iss_idx)
io.misalign_ldin.ready := s0_can_go && io.dcache.req.ready && s0_src_ready_vec(mab_idx)
// for hw prefetch load flow feedback, to be added later
// io.prefetch_in.ready := s0_hw_prf_select
// dcache replacement extra info
// TODO: should prefetch load update replacement?
// Only replayed loads (lsq replay or super replay selected) pass their
// replacementUpdated flag to the dcache; every other source drives false.
io.dcache.replacementUpdated := Mux(s0_src_select_vec(lsq_rep_idx) || s0_src_select_vec(super_rep_idx), io.replay.bits.replacementUpdated, false.B)
// load wakeup
// TODO: vector load wakeup?
2024-07-25 11:27:22 +08:00
// Select flags for the wakeup uop mux, in order: super replay, fast replay,
// mmio fire, lsq replay, scalar issue. Entries pair positionally with the uops
// listed in s0_wakeup_format.
// NOTE(review): presumably the first asserted flag wins in ParallelPriorityMux -- confirm.
val s0_wakeup_selector = Seq(
s0_src_valid_vec(super_rep_idx),
s0_src_valid_vec(fast_rep_idx),
2024-07-25 11:27:22 +08:00
s0_mmio_fire,
s0_src_valid_vec(lsq_rep_idx),
s0_src_valid_vec(int_iss_idx)
2024-07-25 11:27:22 +08:00
)
// Candidate uops for the wakeup port, listed in the same order as the flags in
// s0_wakeup_selector: super replay, fast replay, mmio (uncache), lsq replay,
// scalar issue. Both replay flavours read the uop from io.replay.
val s0_wakeup_format = Seq(
  io.replay.bits.uop,
  io.fast_rep_in.bits.uop,
  io.lsq.uncache.bits.uop,
  io.replay.bits.uop,
  io.ldin.bits.uop
)
val s0_wakeup_uop = ParallelPriorityMux(s0_wakeup_selector, s0_wakeup_format)
// Wakeup is raised either by an mmio fire, or by a firing stage-0 flow that is
// neither a vector access nor from the misalign buffer and whose source is a
// replay or an un-preempted, non-prefetch scalar issue.
io.wakeup.valid := {
  // Any of the three replay-type sources is valid.
  val replaySrcValid =
    s0_src_valid_vec(super_rep_idx) ||
    s0_src_valid_vec(fast_rep_idx) ||
    s0_src_valid_vec(lsq_rep_idx)
  // Scalar issue that is not a prefetch, with no vector issue and no
  // high-confidence prefetch valid at the same time.
  val scalarIssueOnly =
    s0_src_valid_vec(int_iss_idx) && !s0_sel_src.prf &&
    !s0_src_valid_vec(vec_iss_idx) &&
    !s0_src_valid_vec(high_pf_idx)
  val pipeWakeup =
    s0_fire && !s0_sel_src.isvec && !s0_sel_src.frm_mabuf && (replaySrcValid || scalarIssueOnly)
  pipeWakeup || s0_mmio_fire
}
2024-07-25 11:27:22 +08:00
// The wakeup payload is the uop chosen by the priority mux.
io.wakeup.bits := s0_wakeup_uop
// prefetch.i(Zicbop)
// valid is the registered (one cycle later) copy of "scalar issue selected and
// the source is flagged prf_i"; vaddr is captured from s0_out.vaddr under the
// same condition, with a reset value of 0.
io.ifetchPrefetch.valid := RegNext(s0_src_select_vec(int_iss_idx) && s0_sel_src.prf_i)
io.ifetchPrefetch.bits.vaddr := RegEnable(s0_out.vaddr, 0.U, s0_src_select_vec(int_iss_idx) && s0_sel_src.prf_i)
// Debug print on every fired dcache request: source pc and the dcache vaddr.
XSDebug(io.dcache.req.fire,
p"[DCACHE LOAD REQ] pc ${Hexadecimal(s0_sel_src.uop.pc)}, vaddr ${Hexadecimal(s0_dcache_vaddr)}\n"
2020-10-27 18:11:11 +08:00
)
// Debug print whenever stage 0 is valid: pc, load-queue index, vaddr and mask
// of the stage-0 output bundle.
XSDebug(s0_valid,
p"S0: pc ${Hexadecimal(s0_out.uop.pc)}, lId ${Hexadecimal(s0_out.uop.lqIdx.asUInt)}, " +
p"vaddr ${Hexadecimal(s0_out.vaddr)}, mask ${Hexadecimal(s0_out.mask)}\n")
// Pipeline
// --------------------------------------------------------------------------------
// stage 1
// --------------------------------------------------------------------------------
2023-07-18 11:53:47 +08:00
// TLB resp (send paddr to dcache)
// Stage-1 occupancy flag, cleared on reset.
val s1_valid = RegInit(false.B)
2023-07-18 11:53:47 +08:00
// s1_in is the bundle latched from stage 0; s1_out is the bundle produced by stage 1.
val s1_in = Wire(new LqWriteBundle)
val s1_out = Wire(new LqWriteBundle)
// Kill request for the load currently in stage 1 (driven further below).
val s1_kill = Wire(Bool())
2023-07-18 11:53:47 +08:00
// Stage 1 may advance only when stage 2 is ready.
val s1_can_go = s2_ready
// Stage 1 fires when it holds a valid, non-killed load and may advance.
val s1_fire = s1_valid && !s1_kill && s1_can_go
// vecActive captured from the stage-0 output on s0_fire; reset value is true.
val s1_vecActive = RegEnable(s0_out.vecActive, true.B, s0_fire)
2023-07-18 11:53:47 +08:00
// Stage 1 can take a new load when stage 2 is ready, when its current occupant
// is being killed, or when it is empty.
s1_ready := s2_ready || s1_kill || !s1_valid
// Occupancy update: an incoming stage-0 fire has priority and sets the flag;
// otherwise the flag is cleared when the occupant leaves (s1_fire) or is killed.
when (s0_fire) {
  s1_valid := true.B
} .elsewhen (s1_fire || s1_kill) {
  s1_valid := false.B
}
// Latch the stage-0 output bundle whenever stage 0 fires.
s1_in := RegEnable(s0_out, s0_fire)
// Delayed kill / error indications: sampled while the producing channel is
// valid and only honoured when the stage-1 load is of the matching kind
// (fast replay, or load-to-load fast path).
val s1_fast_rep_dly_kill =
  s1_in.isFastReplay && RegEnable(io.fast_rep_in.bits.lateKill, io.fast_rep_in.valid)
val s1_fast_rep_dly_err =
  s1_in.isFastReplay && RegEnable(io.fast_rep_in.bits.delayedLoadError, io.fast_rep_in.valid)
val s1_l2l_fwd_dly_err =
  s1_in.isFastPath && RegEnable(io.l2l_fwd_in.dly_ld_err, io.l2l_fwd_in.valid)
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// A delayed error from either the fast-replay path or the load-to-load path.
val s1_dly_err = s1_fast_rep_dly_err || s1_l2l_fwd_dly_err
// Width-inferred wires for the stage-1 virtual address (split at bit 6) and the
// physical / guest-physical addresses; all are driven further below.
val s1_vaddr_hi = Wire(UInt())
val s1_vaddr_lo = Wire(UInt())
val s1_vaddr = Wire(UInt())
2023-07-18 11:53:47 +08:00
val s1_paddr_dup_lsu = Wire(UInt())
2023-09-11 09:30:43 +08:00
val s1_gpaddr_dup_lsu = Wire(UInt())
val s1_paddr_dup_dcache = Wire(UInt())
// True when any exception bit selected for LduCfg is set in the stage-1 output
// uop (the page-fault / access-fault bits of s1_out are updated further below).
val s1_exception = ExceptionNO.selectByFu(s1_out.uop.exceptionVec, LduCfg).asUInt.orR // af & pf exception were modified below.
// A TLB miss only counts when the response is valid and stage 1 holds a load.
val s1_tlb_miss = io.tlb.resp.bits.miss && io.tlb.resp.valid && s1_valid
// pbmt from TLB response entry 0, or 0 when the response is not valid.
val s1_pbmt = Mux(io.tlb.resp.valid, io.tlb.resp.bits.pbmt(0), 0.U(2.W))
// Prefetch classification: a software prefetch is a prefetch that is not a
// hardware prefetch.
val s1_prf = s1_in.isPrefetch
val s1_hw_prf = s1_in.isHWPrefetch
val s1_sw_prf = s1_prf && !s1_hw_prf
val s1_tlb_memidx = io.tlb.resp.bits.memidx
// Rebuild the virtual address from its upper part (bits VAddrBits-1..6) and
// lower 6 bits.
s1_vaddr_hi := s1_in.vaddr(VAddrBits - 1, 6)
s1_vaddr_lo := s1_in.vaddr(5, 0)
s1_vaddr := Cat(s1_vaddr_hi, s1_vaddr_lo)
// Hardware prefetches use the paddr already carried in s1_in; every other load
// takes the TLB response (entry 0 for the lsu copy, entry 1 for the dcache copy).
s1_paddr_dup_lsu := Mux(s1_hw_prf, s1_in.paddr, io.tlb.resp.bits.paddr(0))
s1_paddr_dup_dcache := Mux(s1_hw_prf, s1_in.paddr, io.tlb.resp.bits.paddr(1))
// Same selection for the guest physical address; note that the hardware-prefetch
// arm reuses s1_in.paddr.
s1_gpaddr_dup_lsu := Mux(s1_hw_prf, s1_in.paddr, io.tlb.resp.bits.gpaddr(0))
// Debug timestamp: stamp tlbRespTime when a valid, non-missing TLB response is
// tagged as a load and its memidx matches this load's lqIdx.
// NOTE(review): s1_out is bulk-connected from s1_in later in this stage
// (`s1_out := s1_in`); under Chisel last-connect semantics that later connect
// appears to override this conditional one, so the stamp may never take
// effect -- confirm, and move this block after the bulk connect if so.
when (s1_tlb_memidx.is_ld && io.tlb.resp.valid && !s1_tlb_miss && s1_tlb_memidx.idx === s1_in.uop.lqIdx.value) {
// printf("load idx = %d\n", s1_tlb_memidx.idx)
2023-07-18 11:53:47 +08:00
s1_out.uop.debugInfo.tlbRespTime := GTimer()
}
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Kill the outstanding TLB request when the stage-1 load is killed or carries a
// delayed error.
io.tlb.req_kill := s1_kill || s1_dly_err
// The PMP check address is the paddr carried in the stage-1 input bundle.
io.tlb.req.bits.pmp_addr := s1_in.paddr
2023-07-18 11:53:47 +08:00
// The TLB response is always accepted.
io.tlb.resp.ready := true.B
// Hand both physical-address copies to the dcache.
io.dcache.s1_paddr_dup_lsu <> s1_paddr_dup_lsu
io.dcache.s1_paddr_dup_dcache <> s1_paddr_dup_dcache
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Kill the dcache access in stage 1 when the load is killed, carries a delayed
// error, missed in the TLB, or has an exception pending.
io.dcache.s1_kill := s1_kill || s1_dly_err || s1_tlb_miss || s1_exception
2020-12-13 21:31:00 +08:00
// store to load forwarding
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Query the sbuffer for forwarding only for a valid stage-1 load that has no
// exception, no TLB miss, no kill, no delayed error, and is not a prefetch.
io.sbuffer.valid := s1_valid &&
  !s1_exception &&
  !s1_tlb_miss &&
  !s1_kill &&
  !s1_dly_err &&
  !s1_prf
io.sbuffer.vaddr := s1_vaddr
Sync timing modification of #1681 and #1793 (#1793) * ldu: optimize dcache hitvec wiring In previous design, hitvec is generated in load s1, then send to dcache and lsu (rs) side separately. As dcache and lsu (rs side) is far in real chip, it caused severe wiring problem. Now we generate 2 hitvec in parallel: * hitvec 1 is generated near dcache. To generate that signal, paddr from dtlb is sent to dcache in load_s1 to geerate hitvec. The hitvec is then sent to dcache to generate data array read_way_en. * hitvec 2 is generated near lsu and rs in load_s2, tag read result from dcache, as well as coh_state, is sent to lsu in load_s1, then it is used to calcuate hitvec in load_s2. hitvec 2 is used to generate hit/miss signal used by lsu. It should fix the wiring problem caused by hitvec * ldu: opt loadViolationQuery.resp.ready timing An extra release addr register is added near lsu to speed up the generation of loadViolationQuery.resp.ready * l1tlb: replace NormalPage data module and add duplicate resp result data module: add BankedSyncDataMoudleWithDup data module: divided the data array into banks and read as Async, bypass write data. RegNext the data result * #banks. choose from the chosen data. duplicate: duplicate the chosen data and return to outside(tlb). tlb return (ppn+perm) * #DUP to outside (for load unit only) TODO: load unit use different tlb resp result to different module. one for lsq, one for dcache. * l1tlb: Fix wrong vidx_bypass logic after using duplicate data module We use BankedSyncDataMoudleWithDup instead of SyncDataModuleTemplate, whose write ports are not Vec. Co-authored-by: William Wang <zeweiwang@outlook.com> Co-authored-by: ZhangZifei <1773908404@qq.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2022-09-30 14:13:58 +08:00
// Remaining fields of the sbuffer forward query: the lsu paddr copy plus uop,
// store-queue index, byte mask and pc of the stage-1 load.
io.sbuffer.paddr := s1_paddr_dup_lsu
io.sbuffer.uop := s1_in.uop
io.sbuffer.sqIdx := s1_in.uop.sqIdx
2023-07-18 11:53:47 +08:00
io.sbuffer.mask := s1_in.mask
io.sbuffer.pc := s1_in.uop.pc // FIXME: remove it
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Query the LSQ for forwarding under the same condition as the sbuffer query:
// a valid stage-1 load with no exception, TLB miss, kill or delayed error that
// is not a prefetch.
io.lsq.forward.valid := s1_valid &&
  !s1_exception &&
  !s1_tlb_miss &&
  !s1_kill &&
  !s1_dly_err &&
  !s1_prf
// Address and uop of the querying load.
io.lsq.forward.vaddr := s1_vaddr
io.lsq.forward.paddr := s1_paddr_dup_lsu
io.lsq.forward.uop := s1_in.uop
2023-07-18 11:53:47 +08:00
// Remaining fields of the LSQ forward query; sqIdxMask is tied to 0 here.
io.lsq.forward.sqIdx := s1_in.uop.sqIdx
io.lsq.forward.sqIdxMask := 0.U
io.lsq.forward.mask := s1_in.mask
io.lsq.forward.pc := s1_in.uop.pc // FIXME: remove it
2020-11-02 19:23:04 +08:00
// st-ld violation query
// if store unit is 128-bits memory access, need match 128-bit
// Per store-pipeline query: compare at 16-byte granularity when the query asks
// for a line match or this load is a 128-bit vector access.
private val s1_isMatch128 = io.stld_nuke_query.map(x => (x.bits.matchLine || (s1_in.isvec && s1_in.is128bit)))
// Physical-address match per store pipeline: bits above bit 4 in the 128-bit
// case, bits above bit 3 (8-byte granularity) otherwise.
val s1_nuke_paddr_match = VecInit((0 until StorePipelineWidth).zip(s1_isMatch128).map{case (w, s) => {Mux(s,
s1_paddr_dup_lsu(PAddrBits-1, 4) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 4),
s1_paddr_dup_lsu(PAddrBits-1, 3) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 3))}})
// The load is "nuked" when any store pipeline reports a valid query that passes
// the robIdx ordering check, matches the physical address and overlaps the byte
// mask; the result is suppressed on a TLB miss.
val s1_nuke = VecInit((0 until StorePipelineWidth).map(w => {
io.stld_nuke_query(w).valid && // query valid
isAfter(s1_in.uop.robIdx, io.stld_nuke_query(w).bits.robIdx) && // older store
s1_nuke_paddr_match(w) && // paddr match
(s1_in.mask & io.stld_nuke_query(w).bits.mask).orR // data mask contain
})).asUInt.orR && !s1_tlb_miss
// Stage-1 output defaults to the latched input; the connects that follow
// override individual fields (statement order matters here).
s1_out := s1_in
s1_out.vaddr := s1_vaddr
s1_out.paddr := s1_paddr_dup_lsu
s1_out.gpaddr := s1_gpaddr_dup_lsu
s1_out.tlbMiss := s1_tlb_miss
s1_out.ptwBack := io.tlb.resp.bits.ptwBack
s1_out.rep_info.debug := s1_in.uop.debugInfo
// Software prefetches never report a nuke.
s1_out.rep_info.nuke := s1_nuke && !s1_sw_prf
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Record the combined delayed-error flag in the stage-1 output bundle.
s1_out.delayedLoadError := s1_dly_err
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Exception bits of the stage-1 output uop.
// Without a delayed error, the page-fault, guest-page-fault and access-fault
// bits come from TLB response entry 0 and are suppressed on a TLB miss; page
// fault and access fault are additionally gated by s1_vecActive.
// With a delayed error, page fault, guest page fault and misaligned are
// cleared, and access fault is reported (gated by s1_vecActive).
when (!s1_dly_err) {
// current ori test will cause the case of ldest == 0, below will be modified in the future.
// af & pf exception were modified
s1_out.uop.exceptionVec(loadPageFault) := io.tlb.resp.bits.excp(0).pf.ld && s1_vecActive && !s1_tlb_miss
s1_out.uop.exceptionVec(loadGuestPageFault) := io.tlb.resp.bits.excp(0).gpf.ld && !s1_tlb_miss
s1_out.uop.exceptionVec(loadAccessFault) := io.tlb.resp.bits.excp(0).af.ld && s1_vecActive && !s1_tlb_miss
} .otherwise {
2023-12-20 16:19:28 +08:00
s1_out.uop.exceptionVec(loadPageFault) := false.B
s1_out.uop.exceptionVec(loadGuestPageFault) := false.B
2023-12-20 16:19:28 +08:00
s1_out.uop.exceptionVec(loadAddrMisaligned) := false.B
s1_out.uop.exceptionVec(loadAccessFault) := s1_dly_err && s1_vecActive
}
// pointer chasing
// Registered copy of the stage-0 "try pointer chasing" decision.
// NOTE(review): presumably a one-cycle-delayed copy with reset value false --
// confirm against GatedValidRegNext.
val s1_try_ptr_chasing = GatedValidRegNext(s0_do_try_ptr_chasing, false.B)
// Pointer-chasing vaddr captured from stage 0 when a try is made.
val s1_ptr_chasing_vaddr = RegEnable(s0_ptr_chasing_vaddr, s0_do_try_ptr_chasing)
// Cancel-condition flags; each defaults to false here and any later connect
// overrides that default.
val s1_fu_op_type_not_ld = WireInit(false.B)
val s1_not_fast_match = WireInit(false.B)
val s1_addr_mismatch = WireInit(false.B)
val s1_addr_misaligned = WireInit(false.B)
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// More pointer-chasing flags, defaulting to false until connected later.
val s1_fast_mismatch = WireInit(false.B)
val s1_ptr_chasing_canceled = WireInit(false.B)
val s1_cancel_ptr_chasing = WireInit(false.B)
// Registered copy of the redirect: the payload is captured while io.redirect
// is valid, and the valid bit is a registered copy of io.redirect.valid.
val s1_redirect_reg = Wire(Valid(new Redirect))
s1_redirect_reg.bits := RegEnable(io.redirect.bits, io.redirect.valid)
s1_redirect_reg.valid := GatedValidRegNext(io.redirect.valid)
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Stage-1 kill: asserted by a delayed fast-replay kill, a cancelled pointer
// chase, a flush by the current redirect, a flush by the registered redirect
// (ignored when the registered s0_try_ptr_chasing flag is set), or the
// registered stage-0 kill.
s1_kill := {
  // The stage-0 kill flag is re-sampled whenever any stage-0 input source is
  // valid; its reset value is false.
  val anyS0SrcValid = Seq(
    io.ldin.valid,
    io.vecldin.valid,
    io.replay.valid,
    io.l2l_fwd_in.valid,
    io.fast_rep_in.valid,
    io.misalign_ldin.valid
  ).reduce(_ || _)
  val killedAtS0 = RegEnable(s0_kill, false.B, anyS0SrcValid)
  val flushedByRedirect = s1_in.uop.robIdx.needFlush(io.redirect)
  val flushedByRedirectReg =
    s1_in.uop.robIdx.needFlush(s1_redirect_reg) && !GatedValidRegNext(s0_try_ptr_chasing)
  s1_fast_rep_dly_kill || s1_cancel_ptr_chasing || flushedByRedirect || flushedByRedirectReg || killedAtS0
}
if (EnableLoadToLoadForward) {
// Sometimes, we need to cancel the load-load forwarding.
// These can be put at S0 if timing is bad at S1.
// Case 0: CACHE_SET(base + offset) != CACHE_SET(base) (lowest 6-bit addition has an overflow)
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Mismatch when bit 6 of the pointer-chasing vaddr is set, or when any of imm[11:6] was
// non-zero (that OR-reduction is sampled into a register while s0_do_try_ptr_chasing is high).
// NOTE(review): bit 6 is presumably the carry out of the low 6-bit addition -- confirm
// against the place where s1_ptr_chasing_vaddr is built.
s1_addr_mismatch := s1_ptr_chasing_vaddr(6) ||
RegEnable(io.ld_fast_imm(11, 6).orR, s0_do_try_ptr_chasing)
// Case 1: the address is not 64-bit aligned or the fuOpType is not LD
// (any of the low three vaddr bits being set means the address is not 8-byte aligned)
s1_addr_misaligned := s1_ptr_chasing_vaddr(2, 0).orR
s1_fu_op_type_not_ld := io.ldin.bits.uop.fuOpType =/= LSUOpType.ld
// Case 2: this load-load uop is cancelled
// (observed here as io.ldin.valid being low)
s1_ptr_chasing_canceled := !io.ldin.valid
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Case 3: fast mismatch
// The inverted io.ld_fast_match is sampled into a register while s0_do_try_ptr_chasing is high.
s1_fast_mismatch := RegEnable(!io.ld_fast_match, s0_do_try_ptr_chasing)
when (s1_try_ptr_chasing) {
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// The pointer-chasing attempt is cancelled as soon as any of the individual cancel
// conditions (address mismatch, misalignment, non-LD op, cancelled uop, fast mismatch) holds.
s1_cancel_ptr_chasing := s1_addr_mismatch ||
s1_addr_misaligned ||
s1_fu_op_type_not_ld ||
s1_ptr_chasing_canceled ||
s1_fast_mismatch
// On this path the uop and the first-issue flag are taken directly from io.ldin.
s1_in.uop := io.ldin.bits.uop
s1_in.isFirstIssue := io.ldin.bits.isFirstIssue
// The low 6 address bits come from the pointer-chasing vaddr; bits PAddrBits-1..6 come
// from paddr(0) of the TLB response. Both paddr copies are built the same way.
s1_vaddr_lo := s1_ptr_chasing_vaddr(5, 0)
s1_paddr_dup_lsu := Cat(io.tlb.resp.bits.paddr(0)(PAddrBits - 1, 6), s1_vaddr_lo)
s1_paddr_dup_dcache := Cat(io.tlb.resp.bits.paddr(0)(PAddrBits - 1, 6), s1_vaddr_lo)
2021-12-20 15:32:19 +08:00
// Record the TLB timestamps at the point the data is obtained so that the latency
// calculation stays correct (strictly speaking they should not be recorded here, because
// this path does not use the TLB). Both fields get the same GTimer() value.
s1_in.uop.debugInfo.tlbFirstReqTime := GTimer()
s1_in.uop.debugInfo.tlbRespTime := GTimer()
}
// Only applies while the pointer-chasing attempt has not been cancelled in s1.
when (!s1_cancel_ptr_chasing) {
// Asserted when s1 is trying pointer chasing and none of the following is true this cycle:
// io.replay fires, io.fast_rep_in fires, the high_pf_idx source is valid while
// io.canAcceptHighConfPrefetch is set, or io.misalign_ldin fires.
s0_ptr_chasing_canceled := s1_try_ptr_chasing && !io.replay.fire && !io.fast_rep_in.fire && !(s0_src_valid_vec(high_pf_idx) && io.canAcceptHighConfPrefetch) && !io.misalign_ldin.fire
// While pointer chasing is being tried, io.ldin.ready is forced high.
when (s1_try_ptr_chasing) {
io.ldin.ready := true.B
}
}
}
// The store-queue index mask is pre-calculated in s0 and registered on s0_fire, so that s1
// can send it to the LSQ for forwarding.
val s1_sqIdx_mask = RegEnable(UIntToMask(s0_out.uop.sqIdx.value, StoreQueueSize), s0_fire)
// With load-to-load forwarding enabled, a pointer-chasing load needs a mask calculated from
// ldin.uop in this cycle instead. If the timing here is not OK, load-load forwarding has to
// be disabled (or sqIdxMask could be calculated at RS).
io.lsq.forward.sqIdxMask := (
  if (EnableLoadToLoadForward)
    Mux(s1_try_ptr_chasing, UIntToMask(io.ldin.bits.uop.sqIdx.value, StoreQueueSize), s1_sqIdx_mask)
  else
    s1_sqIdx_mask
)
// MSHR forward request: valid when s1 holds a valid uop whose forward_tlDchannel flag is
// set; the MSHR id and physical address are taken from s1_out.
io.forward_mshr.valid := s1_valid && s1_out.forward_tlDchannel
io.forward_mshr.mshrid := s1_out.mshrid
io.forward_mshr.paddr := s1_out.paddr
// Debug print of the s1 state, conditioned on s1_valid.
XSDebug(s1_valid,
p"S1: pc ${Hexadecimal(s1_out.uop.pc)}, lId ${Hexadecimal(s1_out.uop.lqIdx.asUInt)}, tlb_miss ${io.tlb.resp.bits.miss}, " +
p"paddr ${Hexadecimal(s1_out.paddr)}, mmio ${s1_out.mmio}\n")
// Pipeline
// --------------------------------------------------------------------------------
// stage 2
// --------------------------------------------------------------------------------
// s2: DCache resp
// s2_valid is the stage's valid register (reset to false).
val s2_valid = RegInit(false.B)
// s2_in / s2_out are the stage's input and output bundles; s2_kill is a plain Bool wire.
// All three are wires that are driven by later statements.
val s2_in = Wire(new LqWriteBundle)
val s2_out = Wire(new LqWriteBundle)
val s2_kill = Wire(Bool())
2023-07-18 11:53:47 +08:00
// s2 may advance whenever s3 is ready.
val s2_can_go = s3_ready
// s2 fires when it holds a valid uop that is not killed and may advance.
val s2_fire = s2_valid && !s2_kill && s2_can_go
// vecActive flag captured from s1_out on s1_fire; resets to true.
val s2_vecActive = RegEnable(s1_out.vecActive, true.B, s1_fire)
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// isvec flag captured from s1_out on s1_fire; resets to false.
val s2_isvec = RegEnable(s1_out.isvec, false.B, s1_fire)
// Data-select signals derived from the s2 uop and from the low 4 bits of the s2 paddr.
// NOTE(review): genRdataOH / genDataSelectByOffset are defined elsewhere; presumably they
// produce one-hot selects for load data extraction -- confirm.
val s2_data_select = genRdataOH(s2_out.uop)
val s2_data_select_by_offset = genDataSelectByOffset(s2_out.paddr(3, 0))
val s2_frm_mabuf = s2_in.isFrmMisAlignBuf
// s1_pbmt captured on s1_fire (no reset value given).
val s2_pbmt = RegEnable(s1_pbmt, s1_fire)
// s2 is killed when its uop's robIdx needs flushing according to io.redirect.
s2_kill := s2_in.uop.robIdx.needFlush(io.redirect)
2023-07-18 11:53:47 +08:00
// s2 can take a new uop when s3 is ready to accept the current one, when the current uop
// is being killed, or when s2 is empty.
s2_ready := s3_ready || s2_kill || !s2_valid
// Valid bit of s2: an incoming uop from s1 sets it (highest priority); otherwise it is
// cleared once the held uop either moves on or is killed.
when (s1_fire) {
  s2_valid := true.B
} .elsewhen (s2_fire || s2_kill) {
  s2_valid := false.B
}
// s2_in is the s1 output bundle, captured on s1_fire.
s2_in := RegEnable(s1_out, s1_fire)
// Local copy of the PMP response, initialised from io.pmp.
val s2_pmp = WireInit(io.pmp)
// Shorthands for the prefetch flags carried in s2_in.
val s2_prf = s2_in.isPrefetch
val s2_hw_prf = s2_in.isHWPrefetch
2021-12-20 15:32:19 +08:00
// exceptions that may cause the load address to be invalid / illegal:
// if such an exception happens, the instruction and its exception info
l1tlb: tlb's req port can be configured to be block or non-blocked (#1656) each tlb's port can be configured to be block or non-blocked. For blocked port, there will be a req miss slot stored in tlb, but belong to core pipeline, which means only core pipeline flush will invalid them. For another, itlb also use PTW Filter but with only 4 entries. Last, keep svinval extension as usual, still work. * tlb: add blocked-tlb support, miss frontend changes * tlb: remove tlb's sameCycle support, result will return at next cycle * tlb: remove param ShouldBlock, move block method into TLB module * tlb: fix handle_block's miss_req logic * mmu.filter: change filter's req.ready to canEnqueue when filter can't let all the req enqueue, set the req.ready to false. canEnqueue after filtering has long latency, so we use **_fake without filtering, but the filter will still receive the reqs if it can(after filtering). * mmu.tlb: change name from BTlbPtwIO to VectorTlbPtwIO * mmu: replace itlb's repeater to filter&repeaternb * mmu.tlb: add TlbStorageWrapper to make TLB cleaner more: BlockTlbRequestorIO is same with TlbRequestorIO, rm it * mmu.tlb: rm unused param in function r_req_apply, fix syntax bug * [WIP]icache: itlb usage from non-blocked to blocked * mmu.tlb: change parameter NBWidth to Seq of boolean * icache.mainpipe: fix itlb's resp.ready, not always true * mmu.tlb: add kill sigal to blocked req that needs sync but fail in frontend, icache,itlb,next pipe may not able to sync. blocked tlb will store miss req ang blocks req, which makes itlb couldn't work. So add kill logic to let itlb not to store reqs. One more thing: fix icache's blocked tlb handling logic * icache.mainpipe: fix tlb's ready_recv logic icache mainpipe has two ports, but these two ports may not valid all the same time. So add new signals tlb_need_recv to record whether stage s1 should wait for the tlb. 
* tlb: when flush, just set resp.valid and pf, pf for don't use it * tlb: flush should concern satp.changed(for blocked io now) * mmu.tlb: add new flush that doesn't flush reqs Sfence.vma will flush inflight reqs and flushPipe But some other sfence(svinval...) will not. So add new flush to distinguish these two kinds of sfence signal morw: forget to assign resp result when ptw back, fix it * mmu.tlb: beautify miss_req_v and miss_v relative logic * mmu.tlb: fix bug, when ptw back and bypass, concern level to genPPN bug: when ptw back and bypass, forgot to concern level(1GB/2MB/4KB) when genPPN. by the way: some funtions need ": Unit = ", add it. * mmu.filter: fix bug of canEnqueue, mixed with tlb_req and tlb.req * icache.mainpipe: fix bug of tlbExcp's usage, & with tlb_need_back Icache's mainpipe has two ports, but may only port 0 is valid. When a port is invalid, the tlbexcp should be false.(Actually, should be ignored). So & tlb_need_back to fix this bug. * sfence: instr in svinval ext will also flush pipe A difficult problem to handle: Sfence and Svinval will flush MMU, but only Sfence(some svinval) will flush pipe. For itlb that some requestors are blocked and icache doesn't recv flush for simplicity, itlb's blocked ptw req should not be flushed. It's a huge problem for MMU to handle for good or bad solutions. But svinval is seldom used, so disable it's effiency. * mmu: add parameter to control mmu's sfence delay latency Difficult problem: itlb's blocked req should not be abandoned, but sfence will flush all infight reqs. when itlb and itlb repeater's delay is not same(itlb is flushed, two cycles later, itlb repeater is flushed, then itlb's ptw req after flushing will be also flushed sliently. So add one parameter to control the flush delay to be the same. * mmu.tlb: fix bug of csr.priv's delay & sfence valid when req fire 1. csr.priv's delay csr.priv should not be delayed, csr.satp should be delayed. 
for excep/intr will change csr.priv, which will be changed at one instruction's (commit?). but csrrw satp will not, so satp has more cycles to delay. 2. sfence when sfence valid but blocked req fire, resp should still fire. 3. satp in TlbCsrBundle let high bits of satp.ppn to be 0.U * tlb&icache.mainpipe: rm commented codes * mmu: move method genPPN to entry bundle * l1tlb: divide l1tlb flush into flush_mmu and flush_pipe Problem: For l1tlb, there are blocked and non-blocked req ports. For blocked ports, there are req slots to store missed reqs. Some mmu flush like Sfence should not flush miss slots for outside may still need get tlb resp, no matter wrong and correct resp. For example. sfence will flush mmu and flush pipe, but won't flush reqs inside icache, which waiting for tlb resp. For example, svinval instr will flush mmu, but not flush pipe. so tlb should return correct resp, althrough the ptw req is flushed when tlb miss. Solution: divide l1tlb flush into flush_mmu and flush_pipe. The req slot is considered to be a part of core pipeline and should only be flushed by flush_pipe. flush_mmu will flush mmu entries and inflight ptw reqs. When miss but sfence flushed its ptw req, re-send. * l1tlb: code clean, correct comments and rm unused codes * l2tlb: divide filterSize into ifiterSize and dfilterSize * l2tlb: prefetch req won't enter miss queue. Rename MSHR to missqueue * l1tlb: when disable vm, ptw back should not bypass tlb and should let miss req go ahead
2022-07-18 09:41:17 +08:00
// will be forcibly written back to the rob
2023-12-20 16:19:28 +08:00
// Exception vector for s2, initialised from the vector carried by the uop; individual
// entries may be overridden by later connections.
val s2_exception_vec = WireInit(s2_in.uop.exceptionVec)
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
when (!s2_in.delayedLoadError) {
// loadAccessFault is raised only for an active access (s2_vecActive) and when at least one
// of the following holds:
//   - the uop already carries a load access fault;
//   - s2_pmp.ld is set;
//   - a vector access (s2_isvec) targets a PMP-MMIO region, is not a prefetch and did not
//     miss in the TLB;
//   - the dcache response reports a tag error while the registered cache_error_enable
//     CSR control bit is set.
s2_exception_vec(loadAccessFault) := (s2_in.uop.exceptionVec(loadAccessFault) ||
s2_pmp.ld ||
s2_isvec && s2_pmp.mmio && !s2_prf && !s2_in.tlbMiss ||
(io.dcache.resp.bits.tag_error && GatedValidRegNext(io.csrCtrl.cache_error_enable))
) && s2_vecActive
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
}
// soft prefetch will not trigger any exception (but ecc error interrupt may
// be triggered)
when (!s2_in.delayedLoadError && (s2_prf || s2_in.tlbMiss)) {
// Clear every entry of the exception vector (an all-zero value cast to the vector's type).
s2_exception_vec := 0.U.asTypeOf(s2_exception_vec.cloneType)
l1tlb: tlb's req port can be configured to be block or non-blocked (#1656) each tlb's port can be configured to be block or non-blocked. For blocked port, there will be a req miss slot stored in tlb, but belong to core pipeline, which means only core pipeline flush will invalid them. For another, itlb also use PTW Filter but with only 4 entries. Last, keep svinval extension as usual, still work. * tlb: add blocked-tlb support, miss frontend changes * tlb: remove tlb's sameCycle support, result will return at next cycle * tlb: remove param ShouldBlock, move block method into TLB module * tlb: fix handle_block's miss_req logic * mmu.filter: change filter's req.ready to canEnqueue when filter can't let all the req enqueue, set the req.ready to false. canEnqueue after filtering has long latency, so we use **_fake without filtering, but the filter will still receive the reqs if it can(after filtering). * mmu.tlb: change name from BTlbPtwIO to VectorTlbPtwIO * mmu: replace itlb's repeater to filter&repeaternb * mmu.tlb: add TlbStorageWrapper to make TLB cleaner more: BlockTlbRequestorIO is same with TlbRequestorIO, rm it * mmu.tlb: rm unused param in function r_req_apply, fix syntax bug * [WIP]icache: itlb usage from non-blocked to blocked * mmu.tlb: change parameter NBWidth to Seq of boolean * icache.mainpipe: fix itlb's resp.ready, not always true * mmu.tlb: add kill sigal to blocked req that needs sync but fail in frontend, icache,itlb,next pipe may not able to sync. blocked tlb will store miss req ang blocks req, which makes itlb couldn't work. So add kill logic to let itlb not to store reqs. One more thing: fix icache's blocked tlb handling logic * icache.mainpipe: fix tlb's ready_recv logic icache mainpipe has two ports, but these two ports may not valid all the same time. So add new signals tlb_need_recv to record whether stage s1 should wait for the tlb. 
* tlb: when flush, just set resp.valid and pf, pf for don't use it * tlb: flush should concern satp.changed(for blocked io now) * mmu.tlb: add new flush that doesn't flush reqs Sfence.vma will flush inflight reqs and flushPipe But some other sfence(svinval...) will not. So add new flush to distinguish these two kinds of sfence signal morw: forget to assign resp result when ptw back, fix it * mmu.tlb: beautify miss_req_v and miss_v relative logic * mmu.tlb: fix bug, when ptw back and bypass, concern level to genPPN bug: when ptw back and bypass, forgot to concern level(1GB/2MB/4KB) when genPPN. by the way: some funtions need ": Unit = ", add it. * mmu.filter: fix bug of canEnqueue, mixed with tlb_req and tlb.req * icache.mainpipe: fix bug of tlbExcp's usage, & with tlb_need_back Icache's mainpipe has two ports, but may only port 0 is valid. When a port is invalid, the tlbexcp should be false.(Actually, should be ignored). So & tlb_need_back to fix this bug. * sfence: instr in svinval ext will also flush pipe A difficult problem to handle: Sfence and Svinval will flush MMU, but only Sfence(some svinval) will flush pipe. For itlb that some requestors are blocked and icache doesn't recv flush for simplicity, itlb's blocked ptw req should not be flushed. It's a huge problem for MMU to handle for good or bad solutions. But svinval is seldom used, so disable it's effiency. * mmu: add parameter to control mmu's sfence delay latency Difficult problem: itlb's blocked req should not be abandoned, but sfence will flush all infight reqs. when itlb and itlb repeater's delay is not same(itlb is flushed, two cycles later, itlb repeater is flushed, then itlb's ptw req after flushing will be also flushed sliently. So add one parameter to control the flush delay to be the same. * mmu.tlb: fix bug of csr.priv's delay & sfence valid when req fire 1. csr.priv's delay csr.priv should not be delayed, csr.satp should be delayed. 
for excep/intr will change csr.priv, which will be changed at one instruction's (commit?). but csrrw satp will not, so satp has more cycles to delay. 2. sfence when sfence valid but blocked req fire, resp should still fire. 3. satp in TlbCsrBundle let high bits of satp.ppn to be 0.U * tlb&icache.mainpipe: rm commented codes * mmu: move method genPPN to entry bundle * l1tlb: divide l1tlb flush into flush_mmu and flush_pipe Problem: For l1tlb, there are blocked and non-blocked req ports. For blocked ports, there are req slots to store missed reqs. Some mmu flush like Sfence should not flush miss slots for outside may still need get tlb resp, no matter wrong and correct resp. For example. sfence will flush mmu and flush pipe, but won't flush reqs inside icache, which waiting for tlb resp. For example, svinval instr will flush mmu, but not flush pipe. so tlb should return correct resp, althrough the ptw req is flushed when tlb miss. Solution: divide l1tlb flush into flush_mmu and flush_pipe. The req slot is considered to be a part of core pipeline and should only be flushed by flush_pipe. flush_mmu will flush mmu entries and inflight ptw reqs. When miss but sfence flushed its ptw req, re-send. * l1tlb: code clean, correct comments and rm unused codes * l2tlb: divide filterSize into ifiterSize and dfilterSize * l2tlb: prefetch req won't enter miss queue. Rename MSHR to missqueue * l1tlb: when disable vm, ptw back should not bypass tlb and should let miss req go ahead
2022-07-18 09:41:17 +08:00
}
// s2 has an exception when any entry picked by ExceptionNO.selectByFu for LduCfg is set and
// the access is active (s2_vecActive).
val s2_exception = ExceptionNO.selectByFu(s2_exception_vec, LduCfg).asUInt.orR && s2_vecActive
2021-12-20 15:32:19 +08:00
// Forwarding query against the TileLink D channel, issued with s1 information (valid s1 uop
// with forward_tlDchannel set, its MSHR id and paddr). Returns a hit flag and the data.
// NOTE(review): the results are named s2_*, so forward() presumably registers them
// internally -- confirm in the forward() implementation.
val (s2_fwd_frm_d_chan, s2_fwd_data_frm_d_chan) = io.tl_d_channel.forward(s1_valid && s1_out.forward_tlDchannel, s1_out.mshrid, s1_out.paddr)
// Forwarding result from the MSHR: data-valid flag, hit flag and the data.
val (s2_fwd_data_valid, s2_fwd_frm_mshr, s2_fwd_data_frm_mshr) = io.forward_mshr.forward()
// Data can be forwarded when it is valid and either source reports a hit.
val s2_fwd_frm_d_chan_or_mshr = s2_fwd_data_valid && (s2_fwd_frm_d_chan || s2_fwd_frm_mshr)
// writeback access fault caused by ecc error / bus error
// * ecc data error is slow to generate, so we will not use it until load stage 3
// * in load stage 3, an extra signal io.load_error will be used to
//   NOTE(review): the sentence above is truncated in the original comment -- complete it.
// The access is uncached if the PMP marks it as MMIO or the page-based memory type is uncache.
val s2_actually_mmio = s2_pmp.mmio || Pbmt.isUncache(s2_pbmt)
// It is handled as MMIO only if it is not a prefetch, has no exception and did not miss in
// the TLB.
val s2_mmio = !s2_prf &&
s2_actually_mmio &&
!s2_exception &&
!s2_in.tlbMiss
// Wire that is driven by a later statement; used below to mask dcache-side replay causes.
val s2_full_fwd = Wire(Bool())
// Memory ambiguity: the uop hit in the store set and the LSQ forward response reports an
// invalid address.
val s2_mem_amb = s2_in.uop.storeSetHit &&
io.lsq.forward.addrInvalid
val s2_tlb_miss = s2_in.tlbMiss
// Forward failure: the LSQ forward response reports invalid data.
val s2_fwd_fail = io.lsq.forward.dataInvalid
// The four dcache-side causes below (miss, miss-queue nack, bank conflict, WPU prediction
// failure) are all suppressed when the data is forwarded from the D channel / MSHR or is
// fully forwarded (s2_full_fwd).
val s2_dcache_miss = io.dcache.resp.bits.miss &&
!s2_fwd_frm_d_chan_or_mshr &&
!s2_full_fwd
val s2_mq_nack = io.dcache.s2_mq_nack &&
!s2_fwd_frm_d_chan_or_mshr &&
!s2_full_fwd
val s2_bank_conflict = io.dcache.s2_bank_conflict &&
!s2_fwd_frm_d_chan_or_mshr &&
!s2_full_fwd
val s2_wpu_pred_fail = io.dcache.s2_wpu_pred_fail &&
!s2_fwd_frm_d_chan_or_mshr &&
!s2_full_fwd
// Nack from the load-load / store-load nuke query: the request is valid but not accepted.
val s2_rar_nack = io.lsq.ldld_nuke_query.req.valid &&
!io.lsq.ldld_nuke_query.req.ready
val s2_raw_nack = io.lsq.stld_nuke_query.req.valid &&
!io.lsq.stld_nuke_query.req.ready
// st-ld violation query
// A store query nukes this load when
// 1. the query is valid,
// 2. the load is younger than the querying store (isAfter on robIdx),
// 3. the physical addresses match, and
// 4. the byte masks overlap.
// Per store-pipeline query: compare at 16-byte granularity when the query sets matchLine or
// the load is a 128-bit vector access; otherwise compare at 8-byte granularity.
private val s2_isMatch128 = io.stld_nuke_query.map(x => (x.bits.matchLine || (s2_in.isvec && s2_in.is128bit)))
val s2_nuke_paddr_match = VecInit((0 until StorePipelineWidth).zip(s2_isMatch128).map{case (w, s) => {Mux(s,
s2_in.paddr(PAddrBits-1, 4) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 4),
s2_in.paddr(PAddrBits-1, 3) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 3))}})
// Note on precedence in the final line: `&&` binds tighter than `||`, so the result is
// (any query matches && !s2_tlb_miss) || s2_in.rep_info.nuke. A nuke already recorded in
// rep_info is therefore kept even on a TLB miss.
val s2_nuke = VecInit((0 until StorePipelineWidth).map(w => {
io.stld_nuke_query(w).valid && // query valid
isAfter(s2_in.uop.robIdx, io.stld_nuke_query(w).bits.robIdx) && // older store
s2_nuke_paddr_match(w) && // paddr match
(s2_in.mask & io.stld_nuke_query(w).bits.mask).orR // data mask contain
})).asUInt.orR && !s2_tlb_miss || s2_in.rep_info.nuke
// `handled` flag taken from the dcache response.
val s2_cache_handled = io.dcache.resp.bits.handled
// Tag error from the dcache response, qualified by the registered cache_error_enable
// CSR control bit.
val s2_cache_tag_error = GatedValidRegNext(io.csrCtrl.cache_error_enable) &&
io.dcache.resp.bits.tag_error
val s2_troublem = !s2_exception &&
!s2_mmio &&
!s2_prf &&
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
!s2_in.delayedLoadError
// The load unit never back-pressures the dcache response.
io.dcache.resp.ready := true.B
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// A dcache response is expected unless the access missed in the TLB, has an exception,
// carries a delayed load error, is MMIO, or is a prefetch.
val s2_dcache_should_resp = !(s2_in.tlbMiss || s2_exception || s2_in.delayedLoadError || s2_mmio || s2_prf)
// Sanity check: a valid s2 uop that expects a dcache response must see one this cycle.
assert(!(s2_valid && (s2_dcache_should_resp && !io.dcache.resp.valid)), "DCache response got lost")
// fast replay require
val s2_dcache_fast_rep = (s2_mq_nack || !s2_dcache_miss && (s2_bank_conflict || s2_wpu_pred_fail))
val s2_nuke_fast_rep = !s2_mq_nack &&
!s2_dcache_miss &&
!s2_bank_conflict &&
!s2_wpu_pred_fail &&
!s2_rar_nack &&
!s2_raw_nack &&
s2_nuke
val s2_fast_rep = !s2_mem_amb &&
!s2_tlb_miss &&
!s2_fwd_fail &&
(s2_dcache_fast_rep || s2_nuke_fast_rep) &&
s2_troublem
// need allocate new entry
val s2_can_query = !s2_mem_amb &&
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
!s2_tlb_miss &&
!s2_fwd_fail &&
!s2_frm_mabuf &&
s2_troublem
// On a dcache miss, the data still counts as forwarded when the load is fully covered by
// store forwarding (s2_full_fwd) or a cache tag error was flagged (s2_cache_tag_error).
val s2_data_fwded = s2_dcache_miss && (s2_full_fwd || s2_cache_tag_error)
// ld-ld violation require
// The query is raised for a valid s2 load that satisfies s2_can_query; uop, byte mask and
// physical address are taken straight from s2_in.
io.lsq.ldld_nuke_query.req.valid := s2_valid && s2_can_query
io.lsq.ldld_nuke_query.req.bits.uop := s2_in.uop
io.lsq.ldld_nuke_query.req.bits.mask := s2_in.mask
io.lsq.ldld_nuke_query.req.bits.paddr := s2_in.paddr
// data_valid is true when the load is fully forwarded or s2_fwd_data_valid is set;
// otherwise it is true only if the dcache did not miss.
io.lsq.ldld_nuke_query.req.bits.data_valid := Mux(s2_full_fwd || s2_fwd_data_valid, true.B, !s2_dcache_miss)
2023-07-18 11:53:47 +08:00
// st-ld violation require
// Same request payload and valid condition as the ld-ld query: issued for a valid s2 load
// that satisfies s2_can_query.
io.lsq.stld_nuke_query.req.valid := s2_valid && s2_can_query
io.lsq.stld_nuke_query.req.bits.uop := s2_in.uop
io.lsq.stld_nuke_query.req.bits.mask := s2_in.mask
io.lsq.stld_nuke_query.req.bits.paddr := s2_in.paddr
// data_valid is true when the load is fully forwarded or s2_fwd_data_valid is set;
// otherwise it is true only if the dcache did not miss.
io.lsq.stld_nuke_query.req.bits.data_valid := Mux(s2_full_fwd || s2_fwd_data_valid, true.B, !s2_dcache_miss)
// Merge the store-forwarding results of the LSQ and the sbuffer, one byte lane at a time.
// When both sources provide a byte, the LSQ data is selected over the sbuffer data.
val s2_fwd_mask = Wire(Vec((VLEN/8), Bool()))
val s2_fwd_data = Wire(Vec((VLEN/8), UInt(8.W)))
// Fully forwarded: every byte requested by the load mask has forwarded data and the LSQ
// does not report its forwarded data as invalid.
s2_full_fwd := (s2_in.mask & (~s2_fwd_mask.asUInt).asUInt) === 0.U && !io.lsq.forward.dataInvalid
// generate VLEN/8 byte-wide muxes
(0 until VLEN / 8).foreach { lane =>
  val fromLsq = io.lsq.forward.forwardMask(lane)
  s2_fwd_mask(lane) := fromLsq || io.sbuffer.forwardMask(lane)
  s2_fwd_data(lane) := Mux(fromLsq, io.lsq.forward.forwardData(lane), io.sbuffer.forwardData(lane))
}
2020-08-06 16:58:13 +08:00
// Debug trace emitted when s2 fires: prints the load pc, the LSQ forward data/mask, and the
// forwardData/forwardMask fields carried in s2_in.
// NOTE(review): the second data/mask pair is read from s2_in, not from io.sbuffer — confirm
// this is the intended source for the trace.
XSDebug(s2_fire, "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n",
s2_in.uop.pc,
io.lsq.forward.forwardData.asUInt, io.lsq.forward.forwardMask.asUInt,
s2_in.forwardData.asUInt, s2_in.forwardMask.asUInt
)
2023-07-18 11:53:47 +08:00
// Build the s2 output bundle: start from s2_in, then override individual fields.
// The overrides must stay after the bulk connect, since later connections take precedence.
s2_out := s2_in
s2_out.data := 0.U // data will be generated in load s3
s2_out.uop.fpWen := s2_in.uop.fpWen
s2_out.mmio := s2_mmio
s2_out.uop.flushPipe := false.B
s2_out.uop.exceptionVec := s2_exception_vec
// merged LSQ/sbuffer forwarding result
s2_out.forwardMask := s2_fwd_mask
s2_out.forwardData := s2_fwd_data
s2_out.handledByMSHR := s2_cache_handled
// a miss is only reported when s2_troublem holds
s2_out.miss := s2_dcache_miss && s2_troublem
s2_out.feedbacked := io.feedback_fast.valid
2020-10-17 21:05:46 +08:00
// Generate replay signal caused by:
// * st-ld violation check
// * tlb miss
// * dcache replay
// * forward data invalid
// * dcache miss
// Every cause bit below is gated by s2_troublem, so no cause is recorded when it is low.
s2_out.rep_info.mem_amb := s2_mem_amb && s2_troublem
s2_out.rep_info.tlb_miss := s2_tlb_miss && s2_troublem
s2_out.rep_info.fwd_fail := s2_fwd_fail && s2_troublem
s2_out.rep_info.dcache_rep := s2_mq_nack && s2_troublem
s2_out.rep_info.dcache_miss := s2_dcache_miss && s2_troublem
s2_out.rep_info.bank_conflict := s2_bank_conflict && s2_troublem
s2_out.rep_info.wpu_fail := s2_wpu_pred_fail && s2_troublem
s2_out.rep_info.rar_nack := s2_rar_nack && s2_troublem
s2_out.rep_info.raw_nack := s2_raw_nack && s2_troublem
s2_out.rep_info.nuke := s2_nuke && s2_troublem
// Side information recorded along with the cause bits
s2_out.rep_info.full_fwd := s2_data_fwded
s2_out.rep_info.data_inv_sq_idx := io.lsq.forward.dataInvalidSqIdx
s2_out.rep_info.addr_inv_sq_idx := io.lsq.forward.addrInvalidSqIdx
s2_out.rep_info.rep_carry := io.dcache.resp.bits.replayCarry
s2_out.rep_info.mshr_id := io.dcache.resp.bits.mshr_id
// last_beat is the single paddr bit at index log2Up(refillBytes)
s2_out.rep_info.last_beat := s2_in.paddr(log2Up(refillBytes))
s2_out.rep_info.debug := s2_in.uop.debugInfo
s2_out.rep_info.tlb_id := io.tlb_hint.id
s2_out.rep_info.tlb_full := io.tlb_hint.full
// if forward fails, replay this inst from fetch
val debug_fwd_fail_rep = s2_fwd_fail && !s2_troublem && !s2_in.tlbMiss
// if a ld-ld violation is detected, replay this inst from fetch
// (currently tied to false; the original condition is kept in the trailing comment)
val debug_ldld_nuke_rep = false.B // s2_ldld_violation && !s2_mmio && !s2_is_prefetch && !s2_in.tlbMiss
// to be removed
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Fast feedback is tied off here: valid is constant false, so the payload fields below are
// driven but never flagged valid by this code.
io.feedback_fast.valid := false.B
2023-07-18 11:53:47 +08:00
io.feedback_fast.bits.hit := false.B
io.feedback_fast.bits.flushState := s2_in.ptwBack
2023-09-04 17:53:46 +08:00
// identify the load by its ROB / store-queue / load-queue indices
io.feedback_fast.bits.robIdx := s2_in.uop.robIdx
io.feedback_fast.bits.sqIdx := s2_in.uop.sqIdx
io.feedback_fast.bits.lqIdx := s2_in.uop.lqIdx
io.feedback_fast.bits.sourceType := RSFeedbackType.lrqFull
io.feedback_fast.bits.dataInvalidSqIdx := DontCare
2024-01-03 10:42:03 +08:00
// ld1Cancel is never asserted by this code
io.ldCancel.ld1Cancel := false.B
2023-09-12 01:11:11 +08:00
// fast wakeup
// Candidate is decided in s1: the dcache must not disable fast wakeup, the s1 load must be
// valid and not killed, the TLB response must not be a miss, and the LSQ must not report
// dataInvalidFast.
val s1_fast_uop_valid = WireInit(false.B)
s1_fast_uop_valid :=
2023-07-18 11:53:47 +08:00
!io.dcache.s1_disable_fast_wakeup &&
s1_valid &&
!s1_kill &&
!io.tlb.resp.bits.miss &&
!io.lsq.forward.dataInvalidFast
// The wakeup is confirmed one cycle later against s2 state: the s2 load must be valid, need no
// replay, not be MMIO, not be a non-hardware prefetch, not be a vector access, and not come
// from the misalign buffer.
io.fast_uop.valid := GatedValidRegNext(s1_fast_uop_valid) && (s2_valid && !s2_out.rep_info.need_rep && !s2_mmio && !(s2_prf && !s2_hw_prf)) && !s2_isvec && !s2_frm_mabuf
// uop is captured only when the s1 candidate is valid
io.fast_uop.bits := RegEnable(s1_out.uop, s1_fast_uop_valid)
2020-10-17 21:05:46 +08:00
2023-07-18 11:53:47 +08:00
// Report in s2 whether s1 attempted pointer chasing without cancelling it; the register
// resets to false and is updated only when s1 fires.
io.s2_ptr_chasing := RegEnable(s1_try_ptr_chasing && !s1_cancel_ptr_chasing, false.B, s1_fire)
L1 Prefetch (#2261) * dcache: optimize the ready signal of missqueue Add a custom arbiter. In the case of multiple sources with the same cache block address, the arbiter will assign only one entry in misssqueue but ready for all same cache block address requests. This will reduce the number of replays of the load instruction which cannot enter the missqueue * sta, dcache: add A StorePipe in dcache When the store command passes through the sta pipeline, access the tag and meta of dcache to determine whether it hits, if it hits, update the replacement algorithm, and if miss, send a write intent to missqueue * sta prefetch: add a queue Enter this queue when the Store Address pipeline sends a request, determines that it has a cache miss, and the contention for MSHR fails. The miss request in this queue will be sent to the Store pipeline later. * sbuffer, dcache: store prefetch burst A basic implementation of "Boosting Store Buffer Efficiency with Store-Prefetch Bursts". Store prefetch at exe is disabled. Now, when store goes from sq to sbuffer, it will trigger a store prefetch; when 48 stores fall into 6 cache lines, trigger a store burst perfetch, which will bring a whole page back into dcache. * dcache: restric mshr alloc for prefetch req * restric the max number of entries which can be used by prefetch * merge two same cache line address prefetch write req * dynamically detect memset pattern, all mshr can accept prefetch when pattern is detected * spb: constantin support * dcache: fix missqueue prefetch ready * make prefetch req goes mshr with bigger id * Revert "spb: constantin support" This reverts commit 4ee50b89ba4a62cd28fa22d7fbcb2338ad4b1849. 
* spb: fix bug in burst generator * spb: add load prefetch burst support * topdown: add defines of topdown counters enum * redirect: add redirect type for perf * top-down: add stallReason IOs frontend -> ctrlBlock -> decode -> rename -> dispatch * top-down: add dummy connections * top-down: update TopdownCounters * top-down: imp backend analysis and counter dump * top-down: add HartId in `addSource` * top-down: broadcast lqIdx of ROB head * top-down: frontend signal done * top-down: add memblock topdown interface * Bump HuanCun: add TopDownMonitor * top-down: receive and handle reasons in dispatch * top-down: remove previous top-down code * TopDown: add MemReqSource enum * TopDown: extend mshr_latency range * TopDown: add basic Req Source TODO: distinguish prefetch * store prefetch: refactor parameters and fix bug * change some parameters * fix store pipe bug * fix load prefetch burst * dcache: distinguish L1DataPrefetch and CPUData * top-down: comment out debugging perf counters in ibuffer * TopDown: add path to pass MemReqSource to HuanCun * TopDown: use simpler logic to count reqSource and update Probe count * frontend: update topdown counters * Update HuanCun Topdown for MemReqSource * top-down: fix load stalls * top-down: Change the priority of different stall reasons * store prefetch: add stride and l2 prefetch * add a stride prefetcher * spb and stride will issue prefetch to l2 * when store commits, issue a prefetch to l1 * sbuffer: fix eviction * when valid count reaches StoreBufferSize, do eviction * spf: change store prefetch structure * prefetch @ exe -> l2 cache * stride -> l2 cache * sbuffer: fix replaceIdx * If the way selected by the replacement algorithm cannot be written into dcache, its result is not used. * Revert "sbuffer: fix replaceIdx" This reverts commit 40c16aca956af9fb32554a0f12d18db41c22eecd. 
* spf: find best interval in stamissqueue * Revert "spf: find best interval in stamissqueue" This reverts commit d179f0ce15a5ab989a822de7fe48cc5e2cd96914. * sms: port store to sms Miss store will train sms like load. Now, sms will recieve 4 train sources, 2 for miss load, 2 for miss store, but prefetcher consume 1 train req per cycle, PrefetchTrainFilter is added to deal with this case. * bump huancun * spf: refactor structure * miss stores will train sms, and send prefetch to l2 * miss stores will send prefetch to l1 on issue or commit * spb will send prefetch to l1 * memset: fix memset detection use lqEmpty to check this * constantin: storepf constantin support cherry-pick this to use constantin in storepf * Revert "constantin: storepf constantin support" This reverts commit 2b97767b9fa757d920cac3d80d4893a1380592c7. * storepf: add EnableAtCommitMissTrigger * trigger prefetch at commit only when the store misses with EnableAtCommitMissTrigger * bump coupledl2 * prefetch req from L1 to L2 will Acquire T * fix merge conflict * storepf: do not read meta&tag when pf is disabled * storepf: do not read pcMem when sms store is disabled * fix verilog check * fix verilog * missqueue: support merging prefetch * prefetch req can be merged to pipeline reg * merging prefetch write will update cmd * delay sending out acquire when a prefetch write is about to merge * missqueue: fix bug of merging prefetch write * delay sending out acquire when a pipeline reg is about to merging a prefetch write * temp: disable store pf * missqueue: disable merging prefetch * late prefetch will be ignored * check alias when merging * enable store pf at issue * add L1StreamPrefetcher * fix assert * let prefetch req prefer loadunit1 more than 0 * stream prefetcher * disable stream component in SMS, SMS is only trained on real miss * add a prefetcher monitor to adjust depth & confidence .. 
* add L1 L2 stream prefetch * add gene support * Revert "add gene support" This reverts commit 59ae15640ff3d1cc96347f4d3567d48c740a03bb. * add miss db * l1pf: add stride & store source info in cache meta * add a Stride prefetcher and disable Stride component in sms * prefetch bit in meta is expanded into 3 bits to store source info of prefetcher * prefetch: support sending prefetch req to l3 * l1pf: add FDP & refactor * add basic FDP counters * change stride from Block addr to Byte addr * refactor the code * bump submodules * disable load related chiseldb to reduce db size * fix compile * fix minimalConfig & enable stream * fix stride pc problem * fix minimalconfig compile * bump submodules * refactor stream stride helper * fix compile * bump huancun * disable db to save size * fix l2 assert * bump submodules --------- Co-authored-by: tastynoob <934348725@qq.com> Co-authored-by: Haojin Tang <tanghaojin@outlook.com> Co-authored-by: Guokai Chen <chenguokai17@mails.ucas.ac.cn> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Zhou Yaoyang <shinezyy@qq.com>
2023-09-06 16:07:59 +08:00
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// RegNext prefetch train for better timing
// ** Now, prefetch train is valid at load s3 **
// Train condition (evaluated in s2): a valid load that is not actually MMIO and either had
// no TLB miss or is a hardware prefetch.
val s2_prefetch_train_valid = WireInit(false.B)
s2_prefetch_train_valid := s2_valid && !s2_actually_mmio && (!s2_in.tlbMiss || s2_hw_prf)
io.prefetch_train.valid := GatedValidRegNext(s2_prefetch_train_valid)
// payload is latched from s2_in, enabled by the train condition
io.prefetch_train.bits.fromLsPipelineBundle(s2_in, latch = true, enable = s2_prefetch_train_valid)
// dcache response fields are registered with the same enable
io.prefetch_train.bits.miss := RegEnable(io.dcache.resp.bits.miss, s2_prefetch_train_valid) // TODO: use trace with bank conflict?
io.prefetch_train.bits.meta_prefetch := RegEnable(io.dcache.resp.bits.meta_prefetch, s2_prefetch_train_valid)
io.prefetch_train.bits.meta_access := RegEnable(io.dcache.resp.bits.meta_access, s2_prefetch_train_valid)
// speculative (unregistered) train indications for s1 and s2
io.s1_prefetch_spec := s1_fire
io.s2_prefetch_spec := s2_prefetch_train_valid
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// L1 prefetch train port: condition is a valid s2 load that is not actually MMIO
// (unlike io.prefetch_train, the TLB-miss / hardware-prefetch term is not part of it).
val s2_prefetch_train_l1_valid = WireInit(false.B)
s2_prefetch_train_l1_valid := s2_valid && !s2_actually_mmio
io.prefetch_train_l1.valid := GatedValidRegNext(s2_prefetch_train_l1_valid)
// payload is latched from s2_in; dcache response fields are registered with the same enable
io.prefetch_train_l1.bits.fromLsPipelineBundle(s2_in, latch = true, enable = s2_prefetch_train_l1_valid)
io.prefetch_train_l1.bits.miss := RegEnable(io.dcache.resp.bits.miss, s2_prefetch_train_l1_valid)
io.prefetch_train_l1.bits.meta_prefetch := RegEnable(io.dcache.resp.bits.meta_prefetch, s2_prefetch_train_l1_valid)
io.prefetch_train_l1.bits.meta_access := RegEnable(io.dcache.resp.bits.meta_access, s2_prefetch_train_l1_valid)
// Per-stage pc is forwarded to the dcache only when not building for the FPGA platform;
// on FPGA builds the pc ports are left as DontCare.
if (env.FPGAPlatform){
io.dcache.s0_pc := DontCare
io.dcache.s1_pc := DontCare
2023-02-01 18:49:47 +08:00
io.dcache.s2_pc := DontCare
}else{
io.dcache.s0_pc := s0_out.uop.pc
io.dcache.s1_pc := s1_out.uop.pc
io.dcache.s2_pc := s2_out.uop.pc
}
// Kill the dcache access in s2 when s2_pmp.ld is set, the access is actually MMIO, or s2 is killed.
io.dcache.s2_kill := s2_pmp.ld || s2_actually_mmio || s2_kill
// s1 load leaves for s2: valid, not killed, and s2 can accept it
val s1_ld_left_fire = s1_valid && !s1_kill && s2_ready
// Six duplicated copies of the s2 load-valid bit (all ones or all zeros).
// NOTE(review): presumably duplicated to reduce fan-out on the consumers — confirm.
val s2_ld_valid_dup = RegInit(0.U(6.W))
// Default is cleared; the `when` blocks below override it in order, so the clear on
// s1_kill / hardware prefetch has the final say.
s2_ld_valid_dup := 0x0.U(6.W)
when (s1_ld_left_fire && !s1_out.isHWPrefetch) { s2_ld_valid_dup := 0x3f.U(6.W) }
when (s1_kill || s1_out.isHWPrefetch) { s2_ld_valid_dup := 0x0.U(6.W) }
// Consistency check: copy 0 must match s2_valid, except when the s1 op one cycle earlier
// was a hardware prefetch.
assert(RegNext((s2_valid === s2_ld_valid_dup(0)) || RegNext(s1_out.isHWPrefetch)))
// Pipeline
// --------------------------------------------------------------------------------
// stage 3
// --------------------------------------------------------------------------------
// writeback and update load queue
// s3 is valid one cycle after a valid s2 op that is neither a hardware prefetch nor flushed
// by the current redirect.
val s3_valid = GatedValidRegNext(s2_valid && !s2_out.isHWPrefetch && !s2_out.uop.robIdx.needFlush(io.redirect))
// s2 output bundle captured when s2 fires
val s3_in = RegEnable(s2_out, s2_fire)
val s3_out = Wire(Valid(new MemExuOutput))
// dcache-side fast-replay cause from s2 (gated by s2_troublem), reset to false
val s3_dcache_rep = RegEnable(s2_dcache_fast_rep && s2_troublem, false.B, s2_fire)
val s3_ld_valid_dup = RegEnable(s2_ld_valid_dup, s2_fire)
val s3_fast_rep = Wire(Bool())
val s3_troublem = GatedValidRegNext(s2_troublem)
// the s3 op is killed when its ROB index is flushed by the current redirect
val s3_kill = s3_in.uop.robIdx.needFlush(io.redirect)
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// Vector-only writeback payload for s3
val s3_vecout = Wire(new OnlyVecExuOutput)
2024-01-03 10:42:03 +08:00
// vecActive carried from s2 when s2 fires; resets to true
val s3_vecActive = RegEnable(s2_out.vecActive, true.B, s2_fire)
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// Vector-related fields carried from s2 into s3 when s2 fires
val s3_isvec = RegEnable(s2_out.isvec, false.B, s2_fire)
val s3_vec_alignedType = RegEnable(s2_out.alignedType, s2_fire)
val s3_vec_mBIndex = RegEnable(s2_out.mbIndex, s2_fire)
// set when the s3 op came from the misalign buffer
val s3_frm_mabuf = s3_in.isFrmMisAlignBuf
val s3_mmio = Wire(Valid(new MemExuOutput))
// data-select controls computed in s2, registered for use in s3 (reset to zero)
val s3_data_select = RegEnable(s2_data_select, 0.U(s2_data_select.getWidth.W), s2_fire)
val s3_data_select_by_offset = RegEnable(s2_data_select_by_offset, 0.U.asTypeOf(s2_data_select_by_offset), s2_fire)
// TODO: Fix vector load merge buffer nack
// The nack is currently tied to false, so the XSError below cannot fire as written.
val s3_vec_mb_nack = Wire(Bool())
s3_vec_mb_nack := false.B
XSError(s3_valid && s3_vec_mb_nack, "Merge buffer should always accept vector loads!")
// s3 can accept a new op when it is empty, its op is being killed, or writeback is accepted
s3_ready := !s3_valid || s3_kill || io.ldout.ready
// uncache (MMIO) response valid, delayed by 3 cycles from io.lsq.uncache.fire
s3_mmio.valid := RegNextN(io.lsq.uncache.fire, 3, Some(false.B))
2024-01-03 10:42:03 +08:00
// uncache (MMIO) response payload, delayed by 3 cycles to line up with s3_mmio.valid
s3_mmio.bits := RegNextN(io.lsq.uncache.bits, 3)
// forward last beat
// Ask the TileLink D channel to forward data for the s2 load's MSHR id / paddr.
val (s3_fwd_frm_d_chan, s3_fwd_data_frm_d_chan) = io.tl_d_channel.forward(s2_valid && s2_out.forward_tlDchannel, s2_out.mshrid, s2_out.paddr)
// NOTE(review): this register is enabled by s2_valid while the neighbouring s3 registers
// are enabled by s2_fire — confirm this is intentional.
val s3_fwd_data_valid = RegEnable(s2_fwd_data_valid, false.B, s2_valid)
2023-11-14 13:32:09 +08:00
// D-channel forwarding counts only if the forward hit, forward data was valid in s2, and
// the load is handled by an MSHR.
val s3_fwd_frm_d_chan_valid = (s3_fwd_frm_d_chan && s3_fwd_data_valid && s3_in.handledByMSHR)
// Fast replay is cancelled when a replay requesting D-channel forwarding is present, a
// misaligned load is incoming, or the dcache cannot accept a request.
val s3_fast_rep_canceled = io.replay.valid && io.replay.bits.forward_tlDchannel || io.misalign_ldin.valid || !io.dcache.req.ready
2023-12-18 15:36:59 +08:00
// s3 load fast replay: offered while the s3 load is valid, selected for fast replay, and
// not flushed by a redirect (s3_kill is s3_in.uop.robIdx.needFlush(io.redirect))
io.fast_rep_out.valid := s3_valid && s3_fast_rep && !s3_kill
2023-12-18 15:36:59 +08:00
// fast replay carries the s3 input bundle
io.fast_rep_out.bits := s3_in
// Write the load queue when the s3 op is valid, is not being fast-replayed (or the fast
// replay was cancelled), has not been fed back already, and is not from the misalign buffer.
io.lsq.ldin.valid := s3_valid && (!s3_fast_rep || s3_fast_rep_canceled) && !s3_in.feedbacked && !s3_frm_mabuf
2023-12-18 15:36:59 +08:00
// TODO: check this --by hx
// io.lsq.ldin.valid := s3_valid && (!s3_fast_rep || !io.fast_rep_out.ready) && !s3_in.feedbacked && !s3_in.lateKill
// Bulk connect first, then override individual fields (later connections take precedence).
io.lsq.ldin.bits := s3_in
// a miss is cleared if the data was forwarded from the D channel
io.lsq.ldin.bits.miss := s3_in.miss && !s3_fwd_frm_d_chan_valid
// connect to misalignBuffer
// Only scalar (non-vector) loads are sent, and only when hardware misaligned-load support
// is enabled in the CSR control.
io.misalign_buf.valid := io.lsq.ldin.valid && io.csrCtrl.hd_misalign_ld_enable && !io.lsq.ldin.bits.isvec
io.misalign_buf.bits := s3_in
/* <------- DANGEROUS: Don't change sequence here ! -------> */
io.lsq.ldin.bits.data_wen_dup := s3_ld_valid_dup.asBools
io.lsq.ldin.bits.replacementUpdated := io.dcache.resp.bits.replacementUpdated
io.lsq.ldin.bits.missDbUpdated := GatedValidRegNext(s2_fire && s2_in.hasROBEntry && !s2_in.tlbMiss && !s2_in.missDbUpdated)
2023-07-18 11:53:47 +08:00
// Delayed load error seen in s3. Only generated when EnableAccurateLoadError is set at
// elaboration time; otherwise it is a constant false.
val s3_dly_ld_err =
if (EnableAccurateLoadError) {
io.dcache.resp.bits.error_delayed && GatedValidRegNext(io.csrCtrl.cache_error_enable) && s3_troublem
} else {
WireInit(false.B)
}
// The external delayed-error output is currently tied to false; the original expression is
// kept in the trailing comment.
io.s3_dly_ld_err := false.B // s3_dly_ld_err && s3_valid
io.lsq.ldin.bits.dcacheRequireReplay := s3_dcache_rep
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
io.fast_rep_out.bits.delayedLoadError := s3_dly_ld_err
// matchInvalid reported by the LSQ or sbuffer forward path one cycle earlier, qualified by s3_troublem
val s3_vp_match_fail = GatedValidRegNext(io.lsq.forward.matchInvalid || io.sbuffer.matchInvalid) && s3_troublem
// a match failure is the only source of replay-from-fetch here
val s3_rep_frm_fetch = s3_vp_match_fail
2023-07-18 11:53:47 +08:00
// The ld-ld nuke query response asks for a replay from fetch, and ld-ld violation checking
// is enabled in the CSR control (registered copy).
val s3_ldld_rep_inst =
io.lsq.ldld_nuke_query.resp.valid &&
io.lsq.ldld_nuke_query.resp.bits.rep_frm_fetch &&
GatedValidRegNext(io.csrCtrl.ldld_vio_check_enable)
val s3_flushPipe = s3_ldld_rep_inst
// working copy of the replay info from s3_in; individual fields may be overridden afterwards
val s3_rep_info = WireInit(s3_in.rep_info)
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// a dcache miss no longer needs replay if the data was forwarded from the D channel
s3_rep_info.dcache_miss := s3_in.rep_info.dcache_miss && !s3_fwd_frm_d_chan_valid
// keep only the highest-priority replay cause (lowest set bit of the cause vector), one-hot
val s3_sel_rep_cause = PriorityEncoderOH(s3_rep_info.cause.asUInt)
// an LDU exception only counts when the (vector) element is active
val s3_exception = ExceptionNO.selectByFu(s3_in.uop.exceptionVec, LduCfg).asUInt.orR && s3_vecActive
// misaligned scalar load while hardware misaligned-load support is enabled
val s3_mis_align = s3_valid && s3_in.uop.exceptionVec(loadAddrMisaligned) && io.csrCtrl.hd_misalign_ld_enable && !s3_in.isvec
// On an exception, a delayed load error, or a replay-from-fetch, all replay causes are
// cleared; otherwise the selected one-hot cause is written to the load queue.
when (s3_exception || s3_dly_ld_err || s3_rep_frm_fetch) {
io.lsq.ldin.bits.rep_info.cause := 0.U.asTypeOf(s3_rep_info.cause.cloneType)
} .otherwise {
io.lsq.ldin.bits.rep_info.cause := VecInit(s3_sel_rep_cause.asBools)
}
2020-08-06 16:58:13 +08:00
// Int load, if hit, will be written back at s3.
// s3_out is only valid when the load needs no replay and is not MMIO.
s3_out.valid := s3_valid && !io.lsq.ldin.bits.rep_info.need_rep && !s3_in.mmio
// Bulk-connect the uop first; the assignments that follow override individual fields.
s3_out.bits.uop := s3_in.uop
// Suppress the FP write enable when the load carries an exception.
s3_out.bits.uop.fpWen := s3_in.uop.fpWen && !s3_exception
// Report a load access fault for a delayed load error or a fault already recorded in the
// uop, gated by s3_vecActive.
s3_out.bits.uop.exceptionVec(loadAccessFault) := (s3_dly_ld_err || s3_in.uop.exceptionVec(loadAccessFault)) && s3_vecActive
2023-12-20 16:19:28 +08:00
// flushPipe is forced low on the written-back uop; s3_flushPipe is instead folded into
// replayInst and io.rollback.valid.
s3_out.bits.uop.flushPipe := false.B
2023-12-22 13:24:17 +08:00
// Mark the uop for instruction replay on a replay-from-fetch or a pipe-flush request.
s3_out.bits.uop.replayInst := s3_rep_frm_fetch || s3_flushPipe
// Placeholder data from the pipeline register; io.ldout.bits.data is reassigned later from
// the cache/uncache data path.
s3_out.bits.data := s3_in.data
// Debug fields attached to the writeback.
s3_out.bits.debug.isMMIO := s3_in.mmio
s3_out.bits.debug.isPerfCnt := false.B
s3_out.bits.debug.paddr := s3_in.paddr
s3_out.bits.debug.vaddr := s3_in.vaddr
// Vector load, writeback to merge buffer
// TODO: Add assertion in merge buffer, merge buffer must accept vec load writeback
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// Vector writeback metadata for s3, mostly copied from the s3 pipeline register (s3_in).
s3_vecout.isvec := s3_isvec
s3_vecout.vecdata := 0.U // Data will be assigned later
s3_vecout.mask := s3_in.mask
// s3_vecout.rob_idx_valid := s3_in.rob_idx_valid
// s3_vecout.inner_idx := s3_in.inner_idx
// s3_vecout.rob_idx := s3_in.rob_idx
// s3_vecout.offset := s3_in.offset
s3_vecout.reg_offset := s3_in.reg_offset
s3_vecout.vecActive := s3_vecActive
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// Element bookkeeping for the vector writeback, copied from the s3 pipeline register.
s3_vecout.is_first_ele := s3_in.is_first_ele
// s3_vecout.uopQueuePtr := DontCare // uopQueuePtr is already saved in flow queue
// s3_vecout.flowPtr := s3_in.flowPtr
s3_vecout.elemIdx := s3_in.elemIdx // elemIdx is already saved in flow queue // TODO:
s3_vecout.elemIdxInsideVd := s3_in.elemIdxInsideVd
2024-03-29 10:36:11 +08:00
// Local alias of the pipeline flag; forwarded unchanged to io.vecldout.bits.usSecondInv.
val s3_usSecondInv = s3_in.usSecondInv
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Request a rollback when a valid s3 load must be replayed from fetch or flushes the pipe,
// unless the load carries an exception.
io.rollback.valid := s3_valid && (s3_rep_frm_fetch || s3_flushPipe) && !s3_exception
// Default every rollback field to DontCare; the fields in use are assigned individually
// right after, so this line has to come first.
io.rollback.bits := DontCare
2023-12-20 16:19:28 +08:00
// Rollback payload, taken from the uop being written back in s3.
io.rollback.bits.isRVC := s3_out.bits.uop.preDecodeInfo.isRVC
io.rollback.bits.robIdx := s3_out.bits.uop.robIdx
io.rollback.bits.ftqIdx := s3_out.bits.uop.ftqPtr
io.rollback.bits.ftqOffset := s3_out.bits.uop.ftqOffset
// Replay-from-fetch uses RedirectLevel.flush; every other rollback uses RedirectLevel.flushAfter.
io.rollback.bits.level := Mux(s3_rep_frm_fetch, RedirectLevel.flush, RedirectLevel.flushAfter)
// The redirect target is the PC of the uop in s3.
io.rollback.bits.cfiUpdate.target := s3_out.bits.uop.pc
io.rollback.bits.debug_runahead_checkpoint_id := s3_out.bits.uop.debugInfo.runahead_checkpoint_id
/* <------- DANGEROUS: Don't change sequence here ! -------> */
2023-07-18 11:53:47 +08:00
// The LSQ receives the uop as written back by s3_out, i.e. including the field overrides
// applied to s3_out.bits.uop.
io.lsq.ldin.bits.uop := s3_out.bits.uop
// Both nuke queries are revoked when the load has an exception or needs a replay.
val s3_revoke = s3_exception || io.lsq.ldin.bits.rep_info.need_rep
io.lsq.ldld_nuke_query.revoke := s3_revoke
io.lsq.stld_nuke_query.revoke := s3_revoke
// feedback slow
// s3 copy of the s2 fast-replay flag, registered through GatedValidRegNext.
s3_fast_rep := GatedValidRegNext(s2_fast_rep)
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Slow feedback is only owed when the load is not a load replay, is not being fast-replayed
// (or its fast replay was canceled), and has not already been fed back.
val s3_fb_no_waiting = !s3_in.isLoadReplay &&
(!(s3_fast_rep && !s3_fast_rep_canceled)) &&
!s3_in.feedbacked
// feedback: scalar load will send feedback to RS
// vector load will send signal to VL Merge Buffer, then send feedback at granularity of uops
// Loads flagged s3_frm_mabuf are excluded as well.
io.feedback_slow.valid := s3_valid && s3_fb_no_waiting && !s3_isvec && !s3_frm_mabuf
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Reported as a hit when no replay is needed, or when the LSQ input port is ready
// (io.lsq.ldin.ready) to take the load.
io.feedback_slow.bits.hit := !s3_rep_info.need_rep || io.lsq.ldin.ready
io.feedback_slow.bits.flushState := s3_in.ptwBack
2023-09-12 01:11:11 +08:00
// Identify the load that the slow feedback refers to.
io.feedback_slow.bits.robIdx := s3_in.uop.robIdx
io.feedback_slow.bits.sqIdx := s3_in.uop.sqIdx
io.feedback_slow.bits.lqIdx := s3_in.uop.lqIdx
// The feedback source type is always lrqFull on this path.
io.feedback_slow.bits.sourceType := RSFeedbackType.lrqFull
io.feedback_slow.bits.dataInvalidSqIdx := DontCare
// ld2Cancel is raised for a valid s3 load that needs a replay, is MMIO, or is misaligned;
// vector loads and loads flagged s3_frm_mabuf never raise it.
io.ldCancel.ld2Cancel := s3_valid && (
io.lsq.ldin.bits.rep_info.need_rep || // exe fail or
s3_in.mmio || // is mmio
s3_mis_align // misalign
) && !s3_isvec && !s3_frm_mabuf
2024-01-03 10:42:03 +08:00
// Writeback metadata: the pipeline result when s3 is valid, otherwise the s3_mmio payload.
val s3_ld_wb_meta = Mux(s3_valid, s3_out.bits, s3_mmio.bits)
// data from load queue refill
2024-01-03 10:42:03 +08:00
// Raw uncache data from the LSQ, passed through RegNextN with a depth of 3.
val s3_ld_raw_data_frm_uncache = RegNextN(io.lsq.ld_raw_data, 3)
val s3_merged_data_frm_uncache = s3_ld_raw_data_frm_uncache.mergedData()
// One candidate per byte offset within the 64-bit word: offset k selects bits (63, 8 * k)
// of the merged data, keyed by the 3-bit address offset.
val s3_picked_data_frm_uncache = LookupTree(
  s3_ld_raw_data_frm_uncache.addrOffset,
  (0 until 8).map { byteOffset =>
    byteOffset.U(3.W) -> s3_merged_data_frm_uncache(63, 8 * byteOffset)
  }.toList
)
// Final uncache load data, produced by rdataHelper from the uop and the picked data.
val s3_ld_data_frm_uncache = rdataHelper(s3_ld_raw_data_frm_uncache.uop, s3_picked_data_frm_uncache)
// data from dcache hit
// Bundle holding the dcache response plus the forwarding sources; its mergedData() is
// taken right after these assignments.
val s3_ld_raw_data_frm_cache = Wire(new LoadDataFromDcacheBundle)
// Delayed data field of the dcache response.
s3_ld_raw_data_frm_cache.respDcacheData := io.dcache.resp.bits.data_delayed
// The s2 values below are captured into registers while s2_valid is asserted.
s3_ld_raw_data_frm_cache.forwardMask := RegEnable(s2_fwd_mask, s2_valid)
s3_ld_raw_data_frm_cache.forwardData := RegEnable(s2_fwd_data, s2_valid)
s3_ld_raw_data_frm_cache.uop := RegEnable(s2_out.uop, s2_valid)
// Low four bits of the s2 physical address.
s3_ld_raw_data_frm_cache.addrOffset := RegEnable(s2_out.paddr(3, 0), s2_valid)
// D-channel forwarding is flagged either from the captured s2 result or from the s3 valid
// signal; when the s3 signal is set, its data takes priority over the captured s2 data.
s3_ld_raw_data_frm_cache.forward_D := RegEnable(s2_fwd_frm_d_chan, false.B, s2_valid) || s3_fwd_frm_d_chan_valid
s3_ld_raw_data_frm_cache.forwardData_D := Mux(s3_fwd_frm_d_chan_valid, s3_fwd_data_frm_d_chan, RegEnable(s2_fwd_data_frm_d_chan, s2_valid))
// MSHR forwarding flag and data captured from s2 (the flag register resets to false).
s3_ld_raw_data_frm_cache.forward_mshr := RegEnable(s2_fwd_frm_mshr, false.B, s2_valid)
s3_ld_raw_data_frm_cache.forwardData_mshr := RegEnable(s2_fwd_data_frm_mshr, s2_valid)
s3_ld_raw_data_frm_cache.forward_result_valid := RegEnable(s2_fwd_data_valid, false.B, s2_valid)
val s3_merged_data_frm_cache = s3_ld_raw_data_frm_cache.mergedData()
// Sixteen candidates, one per byte offset into the 128-bit merged data. Offsets 0-7 slice
// up to bit 63 (lower half), offsets 8-15 slice up to bit 127 (upper half); each slice
// starts at bit 8 * offset.
val s3_data_frm_cache = (0 until 16).map { byteOffset =>
  val msb = if (byteOffset < 8) 63 else 127
  s3_merged_data_frm_cache(msb, 8 * byteOffset)
}
// One-hot pick of the candidate, then rdata shaping through newRdataHelper.
val s3_picked_data_frm_cache = Mux1H(s3_data_select_by_offset, s3_data_frm_cache)
val s3_ld_data_frm_cache = newRdataHelper(s3_data_select, s3_picked_data_frm_cache)
// FIXME: add 1 cycle delay ?
2024-01-03 10:42:03 +08:00
// io.lsq.uncache.ready := !s3_valid
// Same exception check as s3_exception, but evaluated on the written-back uop (s3_out).
val s3_outexception = ExceptionNO.selectByFu(s3_out.bits.uop.exceptionVec, LduCfg).asUInt.orR && s3_vecActive
// Scalar writeback port: bulk-connect the selected metadata, then override data and the
// exception vector below.
io.ldout.bits := s3_ld_wb_meta
// Cache-path data when s3 is valid, uncache-path data otherwise.
io.ldout.bits.data := Mux(s3_valid, s3_ld_data_frm_cache, s3_ld_data_frm_uncache)
// Valid for a pipeline writeback that is not vector, not misaligned and not flagged
// s3_frm_mabuf, or for an s3_mmio writeback while s3 itself is not valid.
io.ldout.valid := ((s3_out.valid && !s3_vecout.isvec && !s3_mis_align && !s3_frm_mabuf) ||
(s3_mmio.valid && !s3_valid))
// Only the exceptions selected for LduCfg are reported on this port.
io.ldout.bits.uop.exceptionVec := ExceptionNO.selectByFu(s3_ld_wb_meta.uop.exceptionVec, LduCfg)
2023-07-18 11:53:47 +08:00
2023-12-18 15:36:59 +08:00
// TODO: check this --hx
// io.ldout.valid := s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) && !s3_vecout.isvec ||
// io.lsq.uncache.valid && !io.lsq.uncache.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid && !io.lsq.uncache.bits.isVls
2024-01-03 10:42:03 +08:00
// io.ldout.bits.data := Mux(s3_out.valid, s3_ld_data_frm_cache, s3_ld_data_frm_uncache)
// io.ldout.valid := s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) ||
// s3_mmio.valid && !s3_mmio.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid
2023-12-18 15:36:59 +08:00
// s3 load fast replay
// The fast-replay output carries the s3 pipeline contents whenever a valid s3 load is
// flagged for fast replay.
io.fast_rep_out.valid := s3_valid && s3_fast_rep
io.fast_rep_out.bits := s3_in
// lateKill overrides the bulk connection: set when the load must be replayed from fetch.
io.fast_rep_out.bits.lateKill := s3_rep_frm_fetch
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// Vector feedback: a valid vector load that is owed feedback, needs a replay, and finds the
// LSQ input port not ready.
val vecFeedback = s3_valid && s3_fb_no_waiting && s3_rep_info.need_rep && !io.lsq.ldin.ready && s3_isvec
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// vector output
// Aligned type of the vector access, as tracked in s3.
io.vecldout.bits.alignedType := s3_vec_alignedType
// vec feedback
io.vecldout.bits.vecFeedback := vecFeedback
Add VLSU * miscs: optimize code style * vector: add VLSU param system and redefine vector lq io * VLUopQueue: add flow split and address generation logic * VLUopQueue: add flow issue and writeback logic * VLUopQueue: set vstart for elements with exception * VLUopQueue: handle unit-stride fof loads * VLUopQueue: implement vector masking according to vm * vector: rewrite vector store io * VlFlowQueue: add enqueue and dequeue logic * VLFlowQueue: fix some coding problem * VlFlowQueue: add issue, replay and result logic * VLFlowQueue: add redirect logic * Rob: fix compilation error * vector: remove stale codes * vector: add VSUopQueue and fix bugs for vector load * backbone: add vector load/store execution paths * VSFlowQueue: Basic function * VLUopQueue: add redirect logic for load-load violation * VSFlowQueue: fix some compile problems * VSUopQueue: add signal to indicate whether a flow is the last one * VSFlowQueue: inform scala sq when vector store finished * StoreQueue: maintain sequential retirement between scalar & vector stores * LoadQueueRAW: handle violation between vector stores & scalar loads * LDU: add vector store to scalar load forwarding * XSCore: fix writeback width of MemBlock * vector: fix load/store whole register and masked unit-stride load/store emul, evl, flownum (#2383) * VSFlowQueue: Support STLF * VLFlowQueue: fix compile bug * VSFlowQueue: fix compile problem --------- Co-authored-by: xuzefan <ceba_robot@outlook.com> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn> Co-authored-by: weidingliu <1045251744@qq.com>
2023-10-19 13:06:56 +08:00
// TODO: VLSU, uncache data logic
// Vector element data, produced by rdataVecHelper from the low two bits of the aligned type
// and the data picked on the cache path.
val vecdata = rdataVecHelper(s3_vec_alignedType(1,0), s3_picked_data_frm_cache)
2024-03-29 10:36:11 +08:00
// 128-bit accesses return the whole merged cache data; all others return the shaped
// per-element vecdata.
io.vecldout.bits.vecdata.get := Mux(s3_in.is128bit, s3_merged_data_frm_cache, vecdata)
io.vecldout.bits.isvec := s3_vecout.isvec
io.vecldout.bits.elemIdx := s3_vecout.elemIdx
2024-03-29 10:36:11 +08:00
// Forwarded from the s3 vector writeback metadata (s3_vecout).
io.vecldout.bits.elemIdxInsideVd.get := s3_vecout.elemIdxInsideVd
io.vecldout.bits.mask := s3_vecout.mask
2024-03-29 10:36:11 +08:00
io.vecldout.bits.reg_offset.get := s3_vecout.reg_offset
io.vecldout.bits.usSecondInv := s3_usSecondInv
io.vecldout.bits.mBIndex := s3_vec_mBIndex
// Same hit condition as io.feedback_slow: no replay needed, or the LSQ input port is ready.
io.vecldout.bits.hit := !s3_rep_info.need_rep || io.lsq.ldin.ready
io.vecldout.bits.sourceType := RSFeedbackType.lrqFull
io.vecldout.bits.flushState := DontCare
// Only the exceptions selected for VlduCfg are reported on the vector port.
io.vecldout.bits.exceptionVec := ExceptionNO.selectByFu(s3_out.bits.uop.exceptionVec, VlduCfg)
io.vecldout.bits.vaddr := s3_in.vaddr
2024-03-29 10:36:11 +08:00
// The mmio field is left undriven (DontCare) on the vector writeback port.
io.vecldout.bits.mmio := DontCare
2023-12-18 15:36:59 +08:00
// Valid for an unflushed vector load leaving s3, or for an unflushed uncache response marked
// isVls while s3 has nothing to write back.
io.vecldout.valid := s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) && s3_vecout.isvec ||
// TODO: check this, why !io.lsq.uncache.bits.isVls before?
io.lsq.uncache.valid && !io.lsq.uncache.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid && io.lsq.uncache.bits.isVls
//io.lsq.uncache.valid && !io.lsq.uncache.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid && !io.lsq.uncache.bits.isVls
// Writeback for loads flagged s3_frm_mabuf: valid when s3 is valid and the load is not being
// fast-replayed (or its fast replay was canceled).
io.misalign_ldout.valid := s3_valid && (!s3_fast_rep || s3_fast_rep_canceled) && s3_frm_mabuf
// Payload mirrors what is sent to the LSQ; data is then overridden with the merged (128-bit)
// or picked cache data.
io.misalign_ldout.bits := io.lsq.ldin.bits
io.misalign_ldout.bits.data := Mux(s3_in.is128bit, s3_merged_data_frm_cache, s3_picked_data_frm_cache)
// fast load to load forward
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Load-to-load forwarding output. The branch is a Scala-level `if`, so only one side is
// elaborated into hardware.
if (EnableLoadToLoadForward) {
// Forward only for a valid, non-MMIO load that needs no replay.
io.l2l_fwd_out.valid := s3_valid && !s3_in.mmio && !s3_rep_info.need_rep
// Bit 3 of the virtual address picks the upper or lower 64 bits of the merged cache data.
io.l2l_fwd_out.data := Mux(s3_in.vaddr(3), s3_merged_data_frm_cache(127, 64), s3_merged_data_frm_cache(63, 0))
io.l2l_fwd_out.dly_ld_err := s3_dly_ld_err || // ecc delayed error
s3_ldld_rep_inst ||
s3_rep_frm_fetch
} else {
// Forwarding disabled: never valid, payload left as DontCare.
io.l2l_fwd_out.valid := false.B
io.l2l_fwd_out.data := DontCare
io.l2l_fwd_out.dly_ld_err := DontCare
}
// trigger
// Data of the most recent fired io.ldout writeback, delayed by one further register.
val last_valid_data = RegNext(RegEnable(io.ldout.bits.data, io.ldout.fire))
// Per-trigger address match results computed in this pipeline.
val hit_ld_addr_trig_hit_vec = Wire(Vec(TriggerNum, Bool()))
// Per-trigger address match results supplied by the LSQ.
val lq_ld_addr_trig_hit_vec = io.lsq.trigger.lqLoadAddrTriggerHitVec
(0 until TriggerNum).map{i => {
// Registered copies of the trigger configuration.
val tdata2 = GatedRegNext(io.trigger(i).tdata2)
val matchType = RegNext(io.trigger(i).matchType)
val tEnable = RegNext(io.trigger(i).tEnable)
// Compare the s2 virtual address (captured while s2_valid) against tdata2.
hit_ld_addr_trig_hit_vec(i) := TriggerCmp(RegEnable(s2_out.vaddr, 0.U, s2_valid), tdata2, matchType, tEnable)
// Report the pipeline's result when s3 writes back, otherwise the LSQ's result.
io.trigger(i).addrHit := Mux(s3_out.valid, hit_ld_addr_trig_hit_vec(i), lq_ld_addr_trig_hit_vec(i))
}}
io.lsq.trigger.hitLoadAddrTriggerHitVec := hit_ld_addr_trig_hit_vec
2024-04-12 18:03:16 +08:00
// Per-stage debug information exported through io.debug_ls.
// s1
io.debug_ls.s1_robIdx := s1_in.uop.robIdx.value
// Pointer chasing was attempted in s1 and not canceled.
io.debug_ls.s1_isLoadToLoadForward := s1_fire && s1_try_ptr_chasing && !s1_ptr_chasing_canceled
// TLB miss on the first issue of the load.
io.debug_ls.s1_isTlbFirstMiss := s1_fire && s1_tlb_miss && s1_in.isFirstIssue
// s2
io.debug_ls.s2_robIdx := s2_in.uop.robIdx.value
io.debug_ls.s2_isBankConflict := s2_fire && (!s2_kill && s2_bank_conflict)
// Dcache miss on the first issue of the load.
io.debug_ls.s2_isDcacheFirstMiss := s2_fire && io.dcache.resp.bits.miss && s2_in.isFirstIssue
io.debug_ls.s2_isForwardFail := s2_fire && s2_fwd_fail
// s3
io.debug_ls.s3_robIdx := s3_in.uop.robIdx.value
io.debug_ls.s3_isReplayFast := s3_valid && s3_fast_rep && !s3_fast_rep_canceled
// A miss reported on either feedback port; the fast-feedback term is delayed one cycle
// by RegNext.
io.debug_ls.s3_isReplayRS := RegNext(io.feedback_fast.valid && !io.feedback_fast.bits.hit) || (io.feedback_slow.valid && !io.feedback_slow.bits.hit)
io.debug_ls.s3_isReplaySlow := io.lsq.ldin.valid && io.lsq.ldin.bits.rep_info.need_rep
io.debug_ls.s3_isReplay := s3_valid && s3_rep_info.need_rep // include fast+slow+rs replay
io.debug_ls.replayCause := s3_rep_info.cause
// replayCnt is tied to the constant 1.
io.debug_ls.replayCnt := 1.U
// Topdown
L1 Prefetch (#2261) * dcache: optimize the ready signal of missqueue Add a custom arbiter. In the case of multiple sources with the same cache block address, the arbiter will assign only one entry in misssqueue but ready for all same cache block address requests. This will reduce the number of replays of the load instruction which cannot enter the missqueue * sta, dcache: add A StorePipe in dcache When the store command passes through the sta pipeline, access the tag and meta of dcache to determine whether it hits, if it hits, update the replacement algorithm, and if miss, send a write intent to missqueue * sta prefetch: add a queue Enter this queue when the Store Address pipeline sends a request, determines that it has a cache miss, and the contention for MSHR fails. The miss request in this queue will be sent to the Store pipeline later. * sbuffer, dcache: store prefetch burst A basic implementation of "Boosting Store Buffer Efficiency with Store-Prefetch Bursts". Store prefetch at exe is disabled. Now, when store goes from sq to sbuffer, it will trigger a store prefetch; when 48 stores fall into 6 cache lines, trigger a store burst perfetch, which will bring a whole page back into dcache. * dcache: restric mshr alloc for prefetch req * restric the max number of entries which can be used by prefetch * merge two same cache line address prefetch write req * dynamically detect memset pattern, all mshr can accept prefetch when pattern is detected * spb: constantin support * dcache: fix missqueue prefetch ready * make prefetch req goes mshr with bigger id * Revert "spb: constantin support" This reverts commit 4ee50b89ba4a62cd28fa22d7fbcb2338ad4b1849. 
* spb: fix bug in burst generator * spb: add load prefetch burst support * topdown: add defines of topdown counters enum * redirect: add redirect type for perf * top-down: add stallReason IOs frontend -> ctrlBlock -> decode -> rename -> dispatch * top-down: add dummy connections * top-down: update TopdownCounters * top-down: imp backend analysis and counter dump * top-down: add HartId in `addSource` * top-down: broadcast lqIdx of ROB head * top-down: frontend signal done * top-down: add memblock topdown interface * Bump HuanCun: add TopDownMonitor * top-down: receive and handle reasons in dispatch * top-down: remove previous top-down code * TopDown: add MemReqSource enum * TopDown: extend mshr_latency range * TopDown: add basic Req Source TODO: distinguish prefetch * store prefetch: refactor parameters and fix bug * change some parameters * fix store pipe bug * fix load prefetch burst * dcache: distinguish L1DataPrefetch and CPUData * top-down: comment out debugging perf counters in ibuffer * TopDown: add path to pass MemReqSource to HuanCun * TopDown: use simpler logic to count reqSource and update Probe count * frontend: update topdown counters * Update HuanCun Topdown for MemReqSource * top-down: fix load stalls * top-down: Change the priority of different stall reasons * store prefetch: add stride and l2 prefetch * add a stride prefetcher * spb and stride will issue prefetch to l2 * when store commits, issue a prefetch to l1 * sbuffer: fix eviction * when valid count reaches StoreBufferSize, do eviction * spf: change store prefetch structure * prefetch @ exe -> l2 cache * stride -> l2 cache * sbuffer: fix replaceIdx * If the way selected by the replacement algorithm cannot be written into dcache, its result is not used. * Revert "sbuffer: fix replaceIdx" This reverts commit 40c16aca956af9fb32554a0f12d18db41c22eecd. 
* spf: find best interval in stamissqueue * Revert "spf: find best interval in stamissqueue" This reverts commit d179f0ce15a5ab989a822de7fe48cc5e2cd96914. * sms: port store to sms Miss store will train sms like load. Now, sms will recieve 4 train sources, 2 for miss load, 2 for miss store, but prefetcher consume 1 train req per cycle, PrefetchTrainFilter is added to deal with this case. * bump huancun * spf: refactor structure * miss stores will train sms, and send prefetch to l2 * miss stores will send prefetch to l1 on issue or commit * spb will send prefetch to l1 * memset: fix memset detection use lqEmpty to check this * constantin: storepf constantin support cherry-pick this to use constantin in storepf * Revert "constantin: storepf constantin support" This reverts commit 2b97767b9fa757d920cac3d80d4893a1380592c7. * storepf: add EnableAtCommitMissTrigger * trigger prefetch at commit only when the store misses with EnableAtCommitMissTrigger * bump coupledl2 * prefetch req from L1 to L2 will Acquire T * fix merge conflict * storepf: do not read meta&tag when pf is disabled * storepf: do not read pcMem when sms store is disabled * fix verilog check * fix verilog * missqueue: support merging prefetch * prefetch req can be merged to pipeline reg * merging prefetch write will update cmd * delay sending out acquire when a prefetch write is about to merge * missqueue: fix bug of merging prefetch write * delay sending out acquire when a pipeline reg is about to merging a prefetch write * temp: disable store pf * missqueue: disable merging prefetch * late prefetch will be ignored * check alias when merging * enable store pf at issue * add L1StreamPrefetcher * fix assert * let prefetch req prefer loadunit1 more than 0 * stream prefetcher * disable stream component in SMS, SMS is only trained on real miss * add a prefetcher monitor to adjust depth & confidence .. 
* add L1 L2 stream prefetch * add gene support * Revert "add gene support" This reverts commit 59ae15640ff3d1cc96347f4d3567d48c740a03bb. * add miss db * l1pf: add stride & store source info in cache meta * add a Stride prefetcher and disable Stride component in sms * prefetch bit in meta is expanded into 3 bits to store source info of prefetcher * prefetch: support sending prefetch req to l3 * l1pf: add FDP & refactor * add basic FDP counters * change stride from Block addr to Byte addr * refactor the code * bump submodules * disable load related chiseldb to reduce db size * fix compile * fix minimalConfig & enable stream * fix stride pc problem * fix minimalconfig compile * bump submodules * refactor stream stride helper * fix compile * bump huancun * disable db to save size * fix l2 assert * bump submodules --------- Co-authored-by: tastynoob <934348725@qq.com> Co-authored-by: Haojin Tang <tanghaojin@outlook.com> Co-authored-by: Guokai Chen <chenguokai17@mails.ucas.ac.cn> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Zhou Yaoyang <shinezyy@qq.com>
2023-09-06 16:07:59 +08:00
// Top-down analysis interface: exposes, per stage, the ROB index of the load together
// with its virtual address (s1) and physical address / miss information (s2).
io.lsTopdownInfo.s1.robIdx := s1_in.uop.robIdx.value
// the s1 virtual address is only reported for requests that own a ROB entry
io.lsTopdownInfo.s1.vaddr_valid := s1_valid && s1_in.hasROBEntry
io.lsTopdownInfo.s1.vaddr_bits := s1_vaddr
io.lsTopdownInfo.s2.robIdx := s2_in.uop.robIdx.value
// the s2 physical address is reported when the load fires, owns a ROB entry and did not miss the TLB
io.lsTopdownInfo.s2.paddr_valid := s2_fire && s2_in.hasROBEntry && !s2_in.tlbMiss
io.lsTopdownInfo.s2.paddr_bits := s2_in.paddr
// forwarded unchanged from the D-cache response
io.lsTopdownInfo.s2.first_real_miss := io.dcache.resp.bits.real_miss
// same qualification as paddr_valid, additionally requiring missDbUpdated to be clear
io.lsTopdownInfo.s2.cache_miss_en := s2_fire && s2_in.hasROBEntry && !s2_in.tlbMiss && !s2_in.missDbUpdated
// perf cnt
2023-07-18 11:53:47 +08:00
// s0 input-port pressure: for the scalar (ldin) and vector (vecldin) issue ports, count
// the cycles a request is presented and the cycles it is presented but not accepted.
Seq(
  ("s0_in_valid", io.ldin.valid),
  ("s0_in_block", io.ldin.valid && !io.ldin.fire),
  ("s0_vecin_valid", io.vecldin.valid),
  ("s0_vecin_block", io.vecldin.valid && !io.vecldin.fire)
).foreach { case (counterName, event) => XSPerfAccumulate(counterName, event) }
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// s0 source mix: first-issue requests selected in s0, and requests re-issued through
// the LSQ replay port (all of them, and the subset flagged isvec).
Seq(
  ("s0_in_fire_first_issue", s0_valid && s0_sel_src.isFirstIssue),
  ("s0_lsq_replay_issue", io.replay.fire),
  ("s0_lsq_replay_vecissue", io.replay.fire && io.replay.bits.isvec)
).foreach { case (counterName, event) => XSPerfAccumulate(counterName, event) }
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Requests accepted on the scalar issue port (io.ldin fires) while the source selected
// in s0 is flagged as first issue.
XSPerfAccumulate("s0_ldu_fire_first_issue", io.ldin.fire && s0_sel_src.isFirstIssue)
2023-07-18 11:53:47 +08:00
// Requests accepted on the fast-replay input, in total and for the subset flagged isvec.
Seq(
  ("s0_fast_replay_issue", io.fast_rep_in.fire),
  ("s0_fast_replay_vecissue", io.fast_rep_in.fire && io.fast_rep_in.bits.isvec)
).foreach { case (counterName, event) => XSPerfAccumulate(counterName, event) }
2023-07-18 11:53:47 +08:00
// s0 stall and address statistics.
//  * stall_out / stall_dcache: s0 holds a valid request but cannot advance / the D-cache
//    request port is not ready.
//  * addr_spec_*: compares bits [VAddrBits-1:12] of the address sent to the D-cache with
//    the same bits of io.ldin.bits.src(0); the *_once variants additionally require the
//    selected source to be a first issue.
//  * vec_addr_vlen_*: for requests flagged isvec, whether the low four address bits are
//    zero (16-byte alignment).
Seq(
  ("s0_stall_out", s0_valid && !s0_can_go),
  ("s0_stall_dcache", s0_valid && !io.dcache.req.ready),
  ("s0_addr_spec_success", s0_fire && s0_dcache_vaddr(VAddrBits-1, 12) === io.ldin.bits.src(0)(VAddrBits-1, 12)),
  ("s0_addr_spec_failed", s0_fire && s0_dcache_vaddr(VAddrBits-1, 12) =/= io.ldin.bits.src(0)(VAddrBits-1, 12)),
  ("s0_addr_spec_success_once", s0_fire && s0_dcache_vaddr(VAddrBits-1, 12) === io.ldin.bits.src(0)(VAddrBits-1, 12) && s0_sel_src.isFirstIssue),
  ("s0_addr_spec_failed_once", s0_fire && s0_dcache_vaddr(VAddrBits-1, 12) =/= io.ldin.bits.src(0)(VAddrBits-1, 12) && s0_sel_src.isFirstIssue),
  ("s0_vec_addr_vlen_aligned", s0_fire && s0_sel_src.isvec && s0_dcache_vaddr(3, 0) === 0.U),
  ("s0_vec_addr_vlen_unaligned", s0_fire && s0_sel_src.isvec && s0_dcache_vaddr(3, 0) =/= 0.U)
).foreach { case (counterName, event) => XSPerfAccumulate(counterName, event) }
2023-07-18 11:53:47 +08:00
// s0 requests that ask for TileLink D-channel forwarding, and prefetches that fire in s0:
// hardware prefetches (s0_hw_prf_select) versus software prefetches (a prf request taken
// from the integer issue source).
Seq(
  ("s0_forward_tl_d_channel", s0_out.forward_tlDchannel),
  ("s0_hardware_prefetch_fire", s0_fire && s0_hw_prf_select),
  ("s0_software_prefetch_fire", s0_fire && s0_sel_src.prf && s0_src_select_vec(int_iss_idx))
).foreach { case (counterName, event) => XSPerfAccumulate(counterName, event) }
2023-07-18 11:53:47 +08:00
// Hardware-prefetch arbitration at s0 (requests offered vs. offered but not selected),
// followed by the s1 occupancy, TLB-miss and stall counters.
Seq(
  ("s0_hardware_prefetch_blocked", io.prefetch_req.valid && !s0_hw_prf_select),
  ("s0_hardware_prefetch_total", io.prefetch_req.valid),
  ("s1_in_valid", s1_valid),
  ("s1_in_fire", s1_fire),
  ("s1_in_fire_first_issue", s1_fire && s1_in.isFirstIssue),
  ("s1_tlb_miss", s1_fire && s1_tlb_miss),
  ("s1_tlb_miss_first_issue", s1_fire && s1_tlb_miss && s1_in.isFirstIssue),
  ("s1_stall_out", s1_valid && !s1_can_go)
).foreach { case (counterName, event) => XSPerfAccumulate(counterName, event) }
Rebase Timing Fix of Memblock from fix-timing branch (#2501) * fix LQ timing * l1pf: fix pf queue to ldu timing * disable ecc path for timing analysis * TODO: remove this * fix pipeline * memblock: add a Reg between inner/outer reset_vec * missqueue: make mem_grant always ready * Enable ECC path again * remove fast replay reorder logic * l1pf: use chosen of arbiter to improve timing * remove reorder remain logic * mq: use ParallelORR instead of orR * Strengthen the conditions for load to load path for timing * fix load to load data select for timing * refactoring lq replay valid logic * fix replay port * fix load unit s0 arbitor logic * add topdown wiring * fix ldu ecc path * remove lateKill * ecc: physically remove ecc in DataArray * loadpipe: use ParallelORR and ParallelMux for timing * mainpipe: use ParallelMux and ParallelorR for timing * fix fast replay is killed at s1 * fix replay cancel logic * fix mq nack feedback logic * sms: fix pf queue tlb req logic for timing * kill load at s1 * fix loadqueuereplay enq logic * opt raw rollback arbiter logic * fix ecc_delayed writeback logic * train all l1 pf and sms at load s3 for better timing * disable load to load forward * Revert "kill load at s1" This reverts commit 56d47582ad4dd9c83373fb2db2a0709075485d4d. * fix s0 kill logic * ITLBRepeater: Add one more buffer when PTW resp * remove trigger * fix feedback_slow logic * add latch in uncachebuffer rollback * remove trigger in port * fast replay: use dcache ready * fix replay logic at s1 * uncache: fix uncache writeback * fix delay kill logic * fix clean exception loigc at s3 * fix ldu rollback logic * fix ldu rollback valid logic --------- Co-authored-by: sfencevma <15155930562@163.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
2023-12-01 12:56:28 +08:00
// Cycles in which s1 holds a valid request and s1_fast_rep_dly_err is asserted.
XSPerfAccumulate("s1_dly_err", s1_valid && s1_fast_rep_dly_err)
2023-07-18 11:53:47 +08:00
// s2 occupancy and D-cache miss counters.
XSPerfAccumulate("s2_in_valid", s2_valid)
XSPerfAccumulate("s2_in_fire", s2_fire)
XSPerfAccumulate("s2_in_fire_first_issue", s2_fire && s2_in.isFirstIssue)
// miss as reported by the D-cache response, for all loads and for first-issue loads only
XSPerfAccumulate("s2_dcache_miss", s2_fire && io.dcache.resp.bits.miss)
XSPerfAccumulate("s2_dcache_miss_first_issue", s2_fire && io.dcache.resp.bits.miss && s2_in.isFirstIssue)
// The "real miss" counter is driven by the response's real_miss flag. It previously used
// resp.bits.miss, which made it an exact duplicate of s2_dcache_miss_first_issue.
XSPerfAccumulate("s2_dcache_real_miss_first_issue", s2_fire && io.dcache.resp.bits.real_miss && s2_in.isFirstIssue)
2023-07-18 11:53:47 +08:00
// s2 forwarding statistics.
// NOTE(review): "s2_dcache_miss_full_forward" is driven by s2_dcache_miss alone; no
// full-forward term appears in its condition. Whether that matches the counter's name
// depends on how s2_dcache_miss is defined earlier in the module - confirm.
// NOTE(review): "s2_fwd_frm_d_can" looks like a typo for "..._d_chan"; the string is
// kept as-is because it is the emitted counter name.
Seq(
  ("s2_full_forward", s2_fire && s2_full_fwd),
  ("s2_dcache_miss_full_forward", s2_fire && s2_dcache_miss),
  ("s2_fwd_frm_d_can", s2_valid && s2_fwd_frm_d_chan),
  ("s2_fwd_frm_d_chan_or_mshr", s2_valid && s2_fwd_frm_d_chan_or_mshr)
).foreach { case (counterName, event) => XSPerfAccumulate(counterName, event) }
2023-07-18 11:53:47 +08:00
// Cycles in which s2 holds a valid load that cannot advance. The condition is qualified
// with s2_valid, the same form used by the s0_stall_out and s1_stall_out counters. It was
// previously qualified with s2_fire; presumably s2_fire already requires s2_can_go, in
// which case the old condition could never be true - confirm against the s2_fire definition.
XSPerfAccumulate("s2_stall_out", s2_valid && !s2_can_go)
// s2 prefetch outcome and D-channel / MSHR forwarding counters.
Seq(
  ("s2_prefetch", s2_fire && s2_prf),
  // ignore prefetch for mshr full / miss req port conflict
  ("s2_prefetch_ignored", s2_fire && s2_prf && io.dcache.s2_mq_nack),
  // prefetch req miss in l1
  ("s2_prefetch_miss", s2_fire && s2_prf && io.dcache.resp.bits.miss),
  // prefetch req hit in l1
  ("s2_prefetch_hit", s2_fire && s2_prf && !io.dcache.resp.bits.miss),
  // prefetch a missed line in l1, and l1 accepted it
  ("s2_prefetch_accept", s2_fire && s2_prf && io.dcache.resp.bits.miss && !io.dcache.s2_mq_nack),
  ("s2_forward_req", s2_fire && s2_in.forward_tlDchannel),
  ("s2_successfully_forward_channel_D", s2_fire && s2_fwd_frm_d_chan && s2_fwd_data_valid),
  ("s2_successfully_forward_mshr", s2_fire && s2_fwd_frm_mshr && s2_fwd_data_valid)
).foreach { case (counterName, event) => XSPerfAccumulate(counterName, event) }
// s3 data-source counters, followed by the load-to-load (pointer-chasing) statistics.
// The "fail_*" counters form a priority breakdown: each one requires every earlier
// reason in the list to be absent, so a failing attempt is attributed to at most one of them.
Seq(
  ("s3_fwd_frm_d_chan", s3_valid && s3_fwd_frm_d_chan_valid),
  ("s3_frm_mabuf", s3_valid && s3_frm_mabuf),
  ("load_to_load_forward", s1_try_ptr_chasing && !s1_ptr_chasing_canceled),
  ("load_to_load_forward_try", s1_try_ptr_chasing),
  ("load_to_load_forward_fail", s1_cancel_ptr_chasing),
  ("load_to_load_forward_fail_cancelled", s1_cancel_ptr_chasing && s1_ptr_chasing_canceled),
  ("load_to_load_forward_fail_wakeup_mismatch", s1_cancel_ptr_chasing && !s1_ptr_chasing_canceled && s1_not_fast_match),
  ("load_to_load_forward_fail_op_not_ld", s1_cancel_ptr_chasing && !s1_ptr_chasing_canceled && !s1_not_fast_match && s1_fu_op_type_not_ld),
  ("load_to_load_forward_fail_addr_align", s1_cancel_ptr_chasing && !s1_ptr_chasing_canceled && !s1_not_fast_match && !s1_fu_op_type_not_ld && s1_addr_misaligned),
  ("load_to_load_forward_fail_set_mismatch", s1_cancel_ptr_chasing && !s1_ptr_chasing_canceled && !s1_not_fast_match && !s1_fu_op_type_not_ld && !s1_addr_misaligned && s1_addr_mismatch)
).foreach { case (counterName, event) => XSPerfAccumulate(counterName, event) }
top-down: align top-down with Gem5 (#2085) * topdown: add defines of topdown counters enum * redirect: add redirect type for perf * top-down: add stallReason IOs frontend -> ctrlBlock -> decode -> rename -> dispatch * top-down: add dummy connections * top-down: update TopdownCounters * top-down: imp backend analysis and counter dump * top-down: add HartId in `addSource` * top-down: broadcast lqIdx of ROB head * top-down: frontend signal done * top-down: add memblock topdown interface * Bump HuanCun: add TopDownMonitor * top-down: receive and handle reasons in dispatch * top-down: remove previous top-down code * TopDown: add MemReqSource enum * TopDown: extend mshr_latency range * TopDown: add basic Req Source TODO: distinguish prefetch * dcache: distinguish L1DataPrefetch and CPUData * top-down: comment out debugging perf counters in ibuffer * TopDown: add path to pass MemReqSource to HuanCun * TopDown: use simpler logic to count reqSource and update Probe count * frontend: update topdown counters * Update HuanCun Topdown for MemReqSource * top-down: fix load stalls * top-down: Change the priority of different stall reasons * top-down: breakdown OtherCoreStall * sbuffer: fix eviction * when valid count reaches StoreBufferSize, do eviction * sbuffer: fix replaceIdx * If the way selected by the replacement algorithm cannot be written into dcache, its result is not used. 
* dcache, ldu: fix vaddr in missqueue This commit prevents the high bits of the virtual address from being truncated * fix-ldst_pri-230506 * mainpipe: fix loadsAreComing * top-down: disable dedup * top-down: remove old top-down config * top-down: split lq addr from ls_debug * top-down: purge previous top-down code * top-down: add debug_vaddr in LoadQueueReplay * add source rob_head_other_repay * remove load_l1_cache_stall_with/wihtou_bank_conflict * dcache: split CPUData & refill latency * split CPUData to CPUStoreData & CPULoadData & CPUAtomicData * monitor refill latency for all type of req * dcache: fix perfcounter in mq * io.req.bits.cancel should be applied when counting req.fire * TopDown: add TopDown for CPL2 in XiangShan * top-down: add hartid params to L2Cache * top-down: fix dispatch queue bound * top-down: no DqStall when robFull * topdown: buspmu support latency statistic (#2106) * perf: add buspmu between L2 and L3, support name argument * bump difftest * perf: busmonitor supports latency stat * config: fix cpl2 compatible problem * bump utility * bump coupledL2 * bump huancun * misc: adapt to utility key&field * config: fix key&field source, remove deprecated argument * buspmu: remove debug print * bump coupledl2&huancun * top-down: fix sq full condition * top-down: classify "lq full" load bound * top-down: bump submodules * bump coupledL2: fix reqSource in data path * bump coupledL2 --------- Co-authored-by: tastynoob <934348725@qq.com> Co-authored-by: Guokai Chen <chenguokai17@mails.ucas.ac.cn> Co-authored-by: lixin <1037997956@qq.com> Co-authored-by: XiChen <chenxi171@mails.ucas.ac.cn> Co-authored-by: Zhou Yaoyang <shinezyy@qq.com> Co-authored-by: Lyn <lyn@Lyns-MacBook-Pro.local> Co-authored-by: wakafa <wangkaifan@ict.ac.cn>
2023-06-02 18:27:43 +08:00
// bug lyq: some signals in perfEvents are no longer suitable for the current MemBlock design
// hardware performance counter
// Each entry pairs an event name with the signal that drives it; the list is handed to
// generatePerfEvent() below (defined outside this chunk). Names and ordering are left
// exactly as they are.
// NOTE(review): "load_s0_in_fire" and "load_s1_in_fire" are both driven by s0_fire, and
// "load_s2_in_fire" by s1_fire; "stall_dcache" additionally requires s0_can_go, unlike
// the "s0_stall_dcache" accumulate counter. Confirm the intended definitions before
// relying on these events.
val perfEvents = Seq(
("load_s0_in_fire ", s0_fire ),
("load_to_load_forward ", s1_fire && s1_try_ptr_chasing && !s1_ptr_chasing_canceled ),
("stall_dcache ", s0_valid && s0_can_go && !io.dcache.req.ready ),
("load_s1_in_fire ", s0_fire ),
("load_s1_tlb_miss ", s1_fire && io.tlb.resp.bits.miss ),
("load_s2_in_fire ", s1_fire ),
("load_s2_dcache_miss ", s2_fire && io.dcache.resp.bits.miss ),
)
generatePerfEvent()
// Debug print of the pc of every uop written back on ldout.
when(io.ldout.fire){
XSDebug("ldout %x\n", io.ldout.bits.uop.pc)
}
// end
}