KAFKA-405 Improve high watermark maintenance to store high watermarks for all partitions in a single .highwatermark file; patched by Neha Narkhede; reviewed by Jay Kreps and Jun Rao

git-svn-id: https://svn.apache.org/repos/asf/incubator/kafka/branches/0.8@1365841 13f79535-47bb-0310-9956-ffa450edef68
Neha Narkhede 2012-07-25 23:42:07 +00:00
parent 842da7bfcd
commit b49de724f1
20 changed files with 577 additions and 296 deletions

View File

@ -56,7 +56,9 @@ class Partition(val topic: String,
assignedReplicas
}
def getReplica(replicaId: Int): Option[Replica] = assignedReplicas().find(_.brokerId == replicaId)
def getReplica(replicaId: Int): Option[Replica] = {
assignedReplicas().find(_.brokerId == replicaId)
}
def addReplica(replica: Replica): Boolean = {
if(!assignedReplicas.contains(replica)) {
@ -65,8 +67,7 @@ class Partition(val topic: String,
}else false
}
def updateReplicaLEO(replica: Replica, leo: Long) {
replica.leoUpdateTime = time.milliseconds
def updateReplicaLeo(replica: Replica, leo: Long) {
replica.logEndOffset(Some(leo))
debug("Updating the leo to %d for replica %d".format(leo, replica.brokerId))
}
@ -108,7 +109,7 @@ class Partition(val topic: String,
val possiblyStuckReplicas = inSyncReplicas.filter(r => r.logEndOffset() < leaderReplica().logEndOffset())
info("Possibly stuck replicas for topic %s partition %d are %s".format(topic, partitionId,
possiblyStuckReplicas.map(_.brokerId).mkString(",")))
val stuckReplicas = possiblyStuckReplicas.filter(r => r.logEndOffsetUpdateTime() < (time.milliseconds - keepInSyncTimeMs))
val stuckReplicas = possiblyStuckReplicas.filter(r => r.logEndOffsetUpdateTime < (time.milliseconds - keepInSyncTimeMs))
info("Stuck replicas for topic %s partition %d are %s".format(topic, partitionId, stuckReplicas.map(_.brokerId).mkString(",")))
val leader = leaderReplica()
// Case 2 above

View File

@ -18,43 +18,38 @@
package kafka.cluster
import kafka.log.Log
import kafka.utils.Logging
import kafka.utils.{SystemTime, Time, Logging}
import kafka.common.KafkaException
class Replica(val brokerId: Int,
val partition: Partition,
val topic: String,
var log: Option[Log] = None,
var leoUpdateTime: Long = -1L) extends Logging {
time: Time = SystemTime,
var hw: Option[Long] = None,
var log: Option[Log] = None) extends Logging {
private var logEndOffset: Long = -1L
private var logEndOffsetUpdateTimeMs: Long = -1L
def logEndOffset(newLeo: Option[Long] = None): Long = {
isLocal match {
case true =>
newLeo match {
case Some(newOffset) => throw new KafkaException("Trying to set the leo %d for local log".format(newOffset))
case Some(newOffset) => logEndOffsetUpdateTimeMs = time.milliseconds; newOffset
case None => log.get.logEndOffset
}
case false =>
newLeo match {
case Some(newOffset) =>
logEndOffset = newOffset
logEndOffsetUpdateTimeMs = time.milliseconds
trace("Setting log end offset for replica %d for topic %s partition %d to %d"
.format(brokerId, topic, partition.partitionId, logEndOffset))
logEndOffset
case None => logEndOffset
}
}
}
def logEndOffsetUpdateTime(time: Option[Long] = None): Long = {
time match {
case Some(t) =>
leoUpdateTime = t
leoUpdateTime
case None =>
leoUpdateTime
}
}
def isLocal: Boolean = {
log match {
case Some(l) => true
@ -62,6 +57,8 @@ class Replica(val brokerId: Int,
}
}
def logEndOffsetUpdateTime = logEndOffsetUpdateTimeMs
def highWatermark(highwaterMarkOpt: Option[Long] = None): Long = {
highwaterMarkOpt match {
case Some(highwaterMark) =>
@ -69,7 +66,7 @@ class Replica(val brokerId: Int,
case true =>
trace("Setting hw for topic %s partition %d on broker %d to %d".format(topic, partition.partitionId,
brokerId, highwaterMark))
log.get.setHW(highwaterMark)
hw = Some(highwaterMark)
highwaterMark
case false => throw new KafkaException("Unable to set highwatermark for topic %s ".format(topic) +
"partition %d on broker %d, since there is no local log for this partition"
@ -78,7 +75,11 @@ class Replica(val brokerId: Int,
case None =>
isLocal match {
case true =>
log.get.getHW()
hw match {
case Some(highWatermarkValue) => highWatermarkValue
case None => throw new KafkaException("HighWatermark does not exist for topic %s ".format(topic) +
" partition %d on broker %d but local log exists".format(partition.partitionId, brokerId))
}
case false => throw new KafkaException("Unable to get highwatermark for topic %s ".format(topic) +
"partition %d on broker %d, since there is no local log for this partition"
.format(partition.partitionId, brokerId))
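Replica now folds getter and setter into a single Option-valued parameter: pass Some(value) to set, pass nothing to get. A minimal sketch of that calling convention (the replica value and offsets here are illustrative):
  replica.logEndOffset(Some(100L)) // remote replica: caches the leo and stamps logEndOffsetUpdateTimeMs
  val leo = replica.logEndOffset() // local replica: delegates to log.get.logEndOffset
  replica.highWatermark(Some(50L)) // setter throws KafkaException unless a local log exists
  val hw = replica.highWatermark() // getter throws KafkaException if no hw was ever set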

View File

@ -93,9 +93,8 @@ private[kafka] class ZookeeperConsumerConnector(val config: ConsumerConfig,
private var fetcher: Option[ConsumerFetcherManager] = None
private var zkClient: ZkClient = null
private var topicRegistry = new Pool[String, Pool[Int, PartitionTopicInfo]]
// topicThreadIdAndQueues : (topic,consumerThreadId) -> queue
private val topicThreadIdAndQueues = new Pool[(String,String), BlockingQueue[FetchedDataChunk]]
private val scheduler = new KafkaScheduler(1, "Kafka-consumer-autocommit-", false)
private val scheduler = new KafkaScheduler(1)
private val messageStreamCreated = new AtomicBoolean(false)
private var sessionExpirationListener: ZKSessionExpireListener = null
@ -121,8 +120,10 @@ private[kafka] class ZookeeperConsumerConnector(val config: ConsumerConfig,
connectZk()
createFetcher()
if (config.autoCommit) {
scheduler.startUp
info("starting auto committer every " + config.autoCommitIntervalMs + " ms")
scheduler.scheduleWithRate(autoCommit, config.autoCommitIntervalMs, config.autoCommitIntervalMs)
scheduler.scheduleWithRate(autoCommit, "Kafka-consumer-autocommit-", config.autoCommitIntervalMs,
config.autoCommitIntervalMs, false)
}
def this(config: ConsumerConfig) = this(config, true)

View File

@ -18,7 +18,7 @@
package kafka.log
import kafka.api.OffsetRequest
import java.io.{IOException, RandomAccessFile, File}
import java.io.{IOException, File}
import java.util.{Comparator, Collections, ArrayList}
import java.util.concurrent.atomic.{AtomicBoolean, AtomicLong, AtomicInteger}
import kafka.utils._
@ -105,10 +105,8 @@ class LogSegment(val file: File, val messageSet: FileMessageSet, val start: Long
* of data to be deleted, we have to compute the offset relative to start of the log segment
* @param offset Absolute offset for this partition
*/
def truncateUpto(offset: Long) = {
assert(offset >= start, "Offset %d used for truncating this log segment cannot be smaller than the start offset %d".
format(offset, start))
messageSet.truncateUpto(offset - start)
def truncateTo(offset: Long) = {
messageSet.truncateTo(offset - start)
}
}
@ -134,13 +132,6 @@ private[kafka] class Log(val dir: File, val maxSize: Long, val flushInterval: In
/* The actual segments of the log */
private[log] val segments: SegmentList[LogSegment] = loadSegments()
/* create the leader highwatermark file handle */
private val hwFile = new RandomAccessFile(dir.getAbsolutePath + "/" + hwFileName, "rw")
info("Created highwatermark file %s for log %s".format(dir.getAbsolutePath + "/" + hwFileName, name))
/* If hw file is absent, the hw defaults to 0. If it exists, hw is set to the checkpointed value */
private var hw: Long = if(hwFile.length() > 0) hwFile.readLong() else { hwFile.writeLong(0); 0 }
private val logStats = new LogStats(this)
Utils.registerMBean(logStats, "kafka:type=kafka.logs." + dir.getName)
@ -221,8 +212,6 @@ private[kafka] class Log(val dir: File, val maxSize: Long, val flushInterval: In
info("Closing log segment " + seg.file.getAbsolutePath)
seg.messageSet.close()
}
checkpointHW()
hwFile.close()
}
}
@ -363,7 +352,6 @@ private[kafka] class Log(val dir: File, val maxSize: Long, val flushInterval: In
segments.view.last.messageSet.flush()
unflushed.set(0)
lastflushedTime.set(System.currentTimeMillis)
checkpointHW()
}
}
@ -435,47 +423,31 @@ private[kafka] class Log(val dir: File, val maxSize: Long, val flushInterval: In
total
}
def recoverUptoLastCheckpointedHW() {
if(hwFile.length() > 0) {
// read the last checkpointed hw from disk
hwFile.seek(0)
val lastKnownHW = hwFile.readLong()
info("Recovering log %s upto highwatermark %d".format(name, lastKnownHW))
def truncateTo(targetOffset: Long) {
// find the log segment that has this hw
val segmentToBeTruncated = segments.view.find(segment =>
lastKnownHW >= segment.start && lastKnownHW < segment.absoluteEndOffset)
targetOffset >= segment.start && targetOffset < segment.absoluteEndOffset)
segmentToBeTruncated match {
case Some(segment) =>
segment.truncateUpto(lastKnownHW)
info("Truncated log segment %s to highwatermark %d".format(segment.file.getAbsolutePath, hw))
val truncatedSegmentIndex = segments.view.indexOf(segment)
segments.truncLast(truncatedSegmentIndex)
segment.truncateTo(targetOffset)
info("Truncated log segment %s to highwatermark %d".format(segment.file.getAbsolutePath, targetOffset))
case None =>
assert(lastKnownHW <= logEndOffset,
assert(targetOffset <= segments.view.last.absoluteEndOffset,
"Last checkpointed hw %d cannot be greater than the latest message offset %d in the log %s".
format(lastKnownHW, segments.view.last.absoluteEndOffset, segments.view.last.file.getAbsolutePath))
format(targetOffset, segments.view.last.absoluteEndOffset, segments.view.last.file.getAbsolutePath))
error("Cannot truncate log to %d since the log start offset is %d and end offset is %d"
.format(lastKnownHW, segments.view.head.start, logEndOffset))
.format(targetOffset, segments.view.head.start, segments.view.last.absoluteEndOffset))
}
val segmentsToBeDeleted = segments.view.filter(segment => segment.start > lastKnownHW)
val segmentsToBeDeleted = segments.view.filter(segment => segment.start > targetOffset)
if(segmentsToBeDeleted.size < segments.view.size) {
val numSegmentsDeleted = deleteSegments(segmentsToBeDeleted)
if(numSegmentsDeleted != segmentsToBeDeleted.size)
error("Failed to delete some segments during log recovery")
}else
info("Unable to recover log upto hw. No previously checkpointed high watermark found for " + name)
}
def setHW(latestLeaderHW: Long) {
hw = latestLeaderHW
}
def getHW(): Long = hw
def checkpointHW() {
hwFile.seek(0)
hwFile.writeLong(hw)
hwFile.getChannel.force(true)
info("Checkpointed highwatermark %d for log %s".format(hw, name))
}
}
def topicName():String = {

View File

@ -30,6 +30,7 @@ import kafka.common.{KafkaException, InvalidTopicException, InvalidPartitionExce
*/
@threadsafe
private[kafka] class LogManager(val config: KafkaConfig,
scheduler: KafkaScheduler,
private val time: Time,
val logCleanupIntervalMs: Long,
val logCleanupDefaultAgeMs: Long,
@ -40,11 +41,9 @@ private[kafka] class LogManager(val config: KafkaConfig,
private val maxSize: Long = config.logFileSize
private val flushInterval = config.flushInterval
private val logCreationLock = new Object
private val logFlusherScheduler = new KafkaScheduler(1, "kafka-logflusher-", false)
private val logFlushIntervals = config.flushIntervalMap
private val logRetentionMs = config.logRetentionHoursMap.map(e => (e._1, e._2 * 60 * 60 * 1000L)) // convert hours to ms
private val logRetentionSize = config.logRetentionSize
private val scheduler = new KafkaScheduler(1, "kafka-logcleaner-", false)
/* Initialize a log for each subdirectory of the main log directory */
private val logs = new Pool[String, Pool[Int, Log]]()
@ -76,17 +75,13 @@ private[kafka] class LogManager(val config: KafkaConfig,
def startup() {
/* Schedule the cleanup task to delete old logs */
if(scheduler != null) {
if(scheduler.hasShutdown) {
println("Restarting log cleaner scheduler")
scheduler.startUp
}
info("starting log cleaner every " + logCleanupIntervalMs + " ms")
scheduler.scheduleWithRate(cleanupLogs, 60 * 1000, logCleanupIntervalMs)
info("Starting log cleaner every " + logCleanupIntervalMs + " ms")
scheduler.scheduleWithRate(cleanupLogs, "kafka-logcleaner-", 60 * 1000, logCleanupIntervalMs, false)
info("Starting log flusher every " + config.flushSchedulerThreadRate +
" ms with the following overrides " + logFlushIntervals)
scheduler.scheduleWithRate(flushAllLogs, "kafka-logflusher-",
config.flushSchedulerThreadRate, config.flushSchedulerThreadRate, false)
}
if(logFlusherScheduler.hasShutdown) logFlusherScheduler.startUp
info("Starting log flusher every " + config.flushSchedulerThreadRate + " ms with the following overrides " + logFlushIntervals)
logFlusherScheduler.scheduleWithRate(flushAllLogs, config.flushSchedulerThreadRate, config.flushSchedulerThreadRate)
}
@ -95,7 +90,7 @@ private[kafka] class LogManager(val config: KafkaConfig,
*/
private def createLog(topic: String, partition: Int): Log = {
if (topic.length <= 0)
throw new InvalidTopicException("topic name can't be emtpy")
throw new InvalidTopicException("Topic name can't be emtpy")
if (partition < 0 || partition >= config.topicPartitionsMap.getOrElse(topic, numPartitions)) {
val error = "Wrong partition %d, valid partitions (0, %d)."
.format(partition, (config.topicPartitionsMap.getOrElse(topic, numPartitions) - 1))
@ -207,10 +202,8 @@ private[kafka] class LogManager(val config: KafkaConfig,
/**
* Close all the logs
*/
def close() {
def shutdown() {
info("Closing log manager")
scheduler.shutdown()
logFlusherScheduler.shutdown()
allLogs.foreach(_.close())
}
@ -220,7 +213,7 @@ private[kafka] class LogManager(val config: KafkaConfig,
def allLogs() = logs.values.flatMap(_.values)
private def flushAllLogs() = {
debug("flushing the high watermark of all logs")
debug("Flushing the high watermark of all logs")
for (log <- allLogs)
{
try{

View File

@ -36,7 +36,7 @@ class LogStats(val log: Log) extends LogStatsMBean {
def getNumberOfSegments: Int = log.numberOfSegments
def getCurrentOffset: Long = log.getHW()
def getCurrentOffset: Long = log.logEndOffset
def getNumAppendedMessages: Long = numCumulatedMessages.get

View File

@ -196,9 +196,12 @@ class FileMessageSet private[kafka](private[message] val channel: FileChannel,
len - validUpTo
}
def truncateUpto(hw: Long) = {
channel.truncate(hw)
setSize.set(hw)
def truncateTo(targetSize: Long) = {
if(targetSize >= sizeInBytes())
throw new KafkaException("Attempt to truncate log segment to %d bytes failed since the current ".format(targetSize) +
" size of this log segment is only %d bytes".format(sizeInBytes()))
channel.truncate(targetSize)
setSize.set(targetSize)
}
/**

View File

@ -0,0 +1,125 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package kafka.server
import kafka.utils.Logging
import java.util.concurrent.locks.ReentrantLock
import java.io._
/**
* This class handles the read/write to the highwaterMark checkpoint file. The file stores the high watermark value for
* all topics and partitions that this broker hosts. The format of this file is as follows -
* version
* number of entries
* topic partition highwatermark
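* For example, a checkpoint covering two partitions might read:
* 0
* 2
* mytopic 0 42
* mytopic 1 7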
*/
object HighwaterMarkCheckpoint {
val highWatermarkFileName = ".highwatermark"
val currentHighwaterMarkFileVersion = 0
}
class HighwaterMarkCheckpoint(val path: String) extends Logging {
/* create the highwatermark file handle for all partitions */
val name = path + "/" + HighwaterMarkCheckpoint.highWatermarkFileName
private val hwFile = new File(name)
private val hwFileLock = new ReentrantLock()
// recover from previous tmp file, if required
def write(highwaterMarksPerPartition: Map[(String, Int), Long]) {
hwFileLock.lock()
try {
// write to temp file and then swap with the highwatermark file
val tempHwFile = new File(hwFile + ".tmp")
// it is an error for this file to be present. It could mean that the previous rename operation failed
if(tempHwFile.exists()) {
fatal("Temporary high watermark %s file exists. This could mean that the ".format(tempHwFile.getAbsolutePath) +
"previous high watermark checkpoint operation has failed.")
System.exit(1)
}
val hwFileWriter = new BufferedWriter(new FileWriter(tempHwFile))
// checkpoint highwatermark for all partitions
// write the current version
hwFileWriter.write(HighwaterMarkCheckpoint.currentHighwaterMarkFileVersion.toString)
hwFileWriter.newLine()
// write the number of entries in the highwatermark file
hwFileWriter.write(highwaterMarksPerPartition.size.toString)
hwFileWriter.newLine()
highwaterMarksPerPartition.foreach { partitionAndHw =>
val topic = partitionAndHw._1._1
val partitionId = partitionAndHw._1._2
hwFileWriter.write("%s %s %s".format(topic, partitionId, partitionAndHw._2))
hwFileWriter.newLine()
}
hwFileWriter.flush()
hwFileWriter.close()
// swap new high watermark file with previous one
hwFile.delete()
if(!tempHwFile.renameTo(hwFile)) {
fatal("Attempt to swap the new high watermark file with the old one failed")
System.exit(1)
}
}finally {
hwFileLock.unlock()
}
}
def read(topic: String, partition: Int): Long = {
hwFileLock.lock()
try {
hwFile.length() match {
case 0 => warn("No previously checkpointed highwatermark value found for topic %s ".format(topic) +
"partition %d. Returning 0 as the highwatermark".format(partition))
0L
case _ =>
val hwFileReader = new BufferedReader(new FileReader(hwFile))
val version = hwFileReader.readLine().toShort
version match {
case HighwaterMarkCheckpoint.currentHighwaterMarkFileVersion =>
val numberOfHighWatermarks = hwFileReader.readLine().toInt
val partitionHighWatermarks =
for(i <- 0 until numberOfHighWatermarks) yield {
val nextHwEntry = hwFileReader.readLine()
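// each entry line has the form "topic partition highwatermark"; the numeric fields are parsed from the right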
val partitionHwInfo = nextHwEntry.split(" ")
val highwaterMark = partitionHwInfo.last.toLong
val partitionId = partitionHwInfo.takeRight(2).head
// find the index of partition
val partitionIndex = nextHwEntry.indexOf(partitionId)
val topic = nextHwEntry.substring(0, partitionIndex-1)
((topic, partitionId.toInt) -> highwaterMark)
}
hwFileReader.close()
val hwOpt = partitionHighWatermarks.toMap.get((topic, partition))
hwOpt match {
case Some(hw) => debug("Read hw %d for topic %s partition %d from highwatermark checkpoint file"
.format(hw, topic, partition))
hw
case None => warn("No previously checkpointed highwatermark value found for topic %s ".format(topic) +
"partition %d. Returning 0 as the highwatermark".format(partition))
0L
}
case _ => fatal("Unrecognized version of the highwatermark checkpoint file " + version)
System.exit(1)
-1L
}
}
}finally {
hwFileLock.unlock()
}
}
}
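A possible end-to-end use of this class, with the directory and offsets purely illustrative: the broker writes the full map once per flush interval and reads individual partitions back on startup.
  val checkpoint = new HighwaterMarkCheckpoint("/tmp/kafka-logs")
  checkpoint.write(Map(("mytopic", 0) -> 42L, ("mytopic", 1) -> 7L))
  val hw = checkpoint.read("mytopic", 0) // 42L; a partition with no entry reads as 0L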

View File

@ -171,7 +171,7 @@ class KafkaApis(val requestChannel: RequestChannel,
kafkaZookeeper.ensurePartitionLeaderOnThisBroker(topicData.topic, partitionData.partition)
val log = logManager.getOrCreateLog(topicData.topic, partitionData.partition)
log.append(partitionData.messages.asInstanceOf[ByteBufferMessageSet])
replicaManager.recordLeaderLogUpdate(topicData.topic, partitionData.partition)
replicaManager.recordLeaderLogEndOffset(topicData.topic, partitionData.partition, log.logEndOffset)
offsets(msgIndex) = log.logEndOffset
errors(msgIndex) = ErrorMapping.NoError.toShort
trace(partitionData.messages.sizeInBytes + " bytes written to logs.")

View File

@ -40,11 +40,12 @@ class KafkaServer(val config: KafkaConfig, time: Time = SystemTime) extends Logg
private val statsMBeanName = "kafka:type=kafka.SocketServerStats"
var socketServer: SocketServer = null
var requestHandlerPool: KafkaRequestHandlerPool = null
private var logManager: LogManager = null
var logManager: LogManager = null
var kafkaZookeeper: KafkaZooKeeper = null
private var replicaManager: ReplicaManager = null
var replicaManager: ReplicaManager = null
private var apis: KafkaApis = null
var kafkaController: KafkaController = new KafkaController(config)
val kafkaScheduler = new KafkaScheduler(4)
var zkClient: ZkClient = null
/**
@ -62,9 +63,13 @@ class KafkaServer(val config: KafkaConfig, time: Time = SystemTime) extends Logg
cleanShutDownFile.delete
}
/* start client */
info("connecting to ZK: " + config.zkConnect)
info("Connecting to ZK: " + config.zkConnect)
zkClient = KafkaZookeeperClient.getZookeeperClient(config)
/* start scheduler */
kafkaScheduler.startUp
/* start log manager */
logManager = new LogManager(config,
kafkaScheduler,
SystemTime,
1000L * 60 * config.logCleanupIntervalMinutes,
1000L * 60 * 60 * config.logRetentionHours,
@ -80,7 +85,7 @@ class KafkaServer(val config: KafkaConfig, time: Time = SystemTime) extends Logg
kafkaZookeeper = new KafkaZooKeeper(config, zkClient, addReplica, getReplica, makeLeader, makeFollower)
replicaManager = new ReplicaManager(config, time, zkClient)
replicaManager = new ReplicaManager(config, time, zkClient, kafkaScheduler)
apis = new KafkaApis(socketServer.requestChannel, logManager, replicaManager, kafkaZookeeper)
requestHandlerPool = new KafkaRequestHandlerPool(socketServer.requestChannel, apis, config.numIoThreads)
@ -90,6 +95,9 @@ class KafkaServer(val config: KafkaConfig, time: Time = SystemTime) extends Logg
// starting relevant replicas and leader election for partitions assigned to this broker
kafkaZookeeper.startup()
// start the replica manager
replicaManager.startup()
// start the controller
kafkaController.startup()
info("Server started.")
@ -103,23 +111,21 @@ class KafkaServer(val config: KafkaConfig, time: Time = SystemTime) extends Logg
val canShutdown = isShuttingDown.compareAndSet(false, true);
if (canShutdown) {
info("Shutting down Kafka server with id " + config.brokerId)
kafkaScheduler.shutdown()
apis.close()
if(replicaManager != null)
replicaManager.close()
replicaManager.shutdown()
if (socketServer != null)
socketServer.shutdown()
if(requestHandlerPool != null)
requestHandlerPool.shutdown()
Utils.unregisterMBean(statsMBeanName)
if(logManager != null)
logManager.close()
logManager.shutdown()
if(kafkaController != null)
kafkaController.shutDown()
kafkaZookeeper.close
info("Closing zookeeper client...")
kafkaZookeeper.shutdown()
zkClient.close()
val cleanShutDownFile = new File(new File(config.logDir), CleanShutdownFile)
debug("Creating clean shutdown file " + cleanShutDownFile.getAbsolutePath())
cleanShutDownFile.createNewFile

View File

@ -104,7 +104,7 @@ class KafkaZooKeeper(config: KafkaConfig,
}
}
def close() {
def shutdown() {
stateChangeHandler.shutdown()
}

View File

@ -13,7 +13,7 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
*/
package kafka.server
import kafka.log.Log
@ -25,21 +25,28 @@ import java.util.concurrent.locks.ReentrantLock
import kafka.utils.{KafkaScheduler, ZkUtils, Time, Logging}
import kafka.common.{KafkaException, InvalidPartitionException}
class ReplicaManager(val config: KafkaConfig, time: Time, zkClient: ZkClient) extends Logging {
class ReplicaManager(val config: KafkaConfig, time: Time, zkClient: ZkClient, kafkaScheduler: KafkaScheduler)
extends Logging {
private var allReplicas = new mutable.HashMap[(String, Int), Partition]()
private var leaderReplicas = new ListBuffer[Partition]()
private val leaderReplicaLock = new ReentrantLock()
private var isrExpirationScheduler = new KafkaScheduler(1, "isr-expiration-thread-", true)
private val replicaFetcherManager = new ReplicaFetcherManager(config, this)
private val highwaterMarkCheckpoint = new HighwaterMarkCheckpoint(config.logDir)
info("Created highwatermark file %s on broker %d".format(highwaterMarkCheckpoint.name, config.brokerId))
// start ISR expiration thread
isrExpirationScheduler.startUp
isrExpirationScheduler.scheduleWithRate(maybeShrinkISR, 0, config.replicaMaxLagTimeMs)
def startup() {
// start the highwatermark checkpoint thread
kafkaScheduler.scheduleWithRate(checkpointHighwaterMarks, "highwatermark-checkpoint-thread", 0,
config.defaultFlushIntervalMs)
// start ISR expiration thread
kafkaScheduler.scheduleWithRate(maybeShrinkISR, "isr-expiration-thread-", 0, config.replicaMaxLagTimeMs)
}
def addLocalReplica(topic: String, partitionId: Int, log: Log, assignedReplicaIds: Set[Int]): Replica = {
val partition = getOrCreatePartition(topic, partitionId, assignedReplicaIds)
val localReplica = new Replica(config.brokerId, partition, topic, Some(log))
val localReplica = new Replica(config.brokerId, partition, topic, time,
Some(readCheckpointedHighWatermark(topic, partitionId)), Some(log))
val replicaOpt = partition.getReplica(config.brokerId)
replicaOpt match {
@ -81,12 +88,12 @@ class ReplicaManager(val config: KafkaConfig, time: Time, zkClient: ZkClient) ex
case Some(partition) => partition
case None =>
throw new InvalidPartitionException("Partition for topic %s partition %d doesn't exist in replica manager on %d"
.format(topic, partitionId, config.brokerId))
.format(topic, partitionId, config.brokerId))
}
}
def addRemoteReplica(topic: String, partitionId: Int, replicaId: Int, partition: Partition): Replica = {
val remoteReplica = new Replica(replicaId, partition, topic)
val remoteReplica = new Replica(replicaId, partition, topic, time)
val replicaAdded = partition.addReplica(remoteReplica)
if(replicaAdded)
@ -112,20 +119,17 @@ class ReplicaManager(val config: KafkaConfig, time: Time, zkClient: ZkClient) ex
Some(replicas.leaderReplica())
case None =>
throw new KafkaException("Getting leader replica failed. Partition replica metadata for topic " +
"%s partition %d doesn't exist in Replica manager on %d".format(topic, partitionId, config.brokerId))
"%s partition %d doesn't exist in Replica manager on %d".format(topic, partitionId, config.brokerId))
}
}
def getPartition(topic: String, partitionId: Int): Option[Partition] =
allReplicas.get((topic, partitionId))
def updateReplicaLEO(replica: Replica, fetchOffset: Long) {
private def updateReplicaLeo(replica: Replica, fetchOffset: Long) {
// set the replica leo
val partition = ensurePartitionExists(replica.topic, replica.partition.partitionId)
partition.updateReplicaLEO(replica, fetchOffset)
partition.updateReplicaLeo(replica, fetchOffset)
}
def maybeIncrementLeaderHW(replica: Replica) {
private def maybeIncrementLeaderHW(replica: Replica) {
// set the replica leo
val partition = ensurePartitionExists(replica.topic, replica.partition.partitionId)
// set the leader HW to min of the leo of all replicas
@ -141,58 +145,70 @@ class ReplicaManager(val config: KafkaConfig, time: Time, zkClient: ZkClient) ex
}
def makeLeader(replica: Replica, currentISRInZk: Seq[Int]) {
// read and cache the ISR
replica.partition.leaderId(Some(replica.brokerId))
replica.partition.updateISR(currentISRInZk.toSet)
// stop replica fetcher thread, if any
replicaFetcherManager.removeFetcher(replica.topic, replica.partition.partitionId)
// also add this partition to the list of partitions for which the leader is the current broker
info("Broker %d started the leader state transition for topic %s partition %d"
.format(config.brokerId, replica.topic, replica.partition.partitionId))
try {
// read and cache the ISR
replica.partition.leaderId(Some(replica.brokerId))
replica.partition.updateISR(currentISRInZk.toSet)
// stop replica fetcher thread, if any
replicaFetcherManager.removeFetcher(replica.topic, replica.partition.partitionId)
// also add this partition to the list of partitions for which the leader is the current broker
leaderReplicaLock.lock()
leaderReplicas += replica.partition
info("Broker %d completed the leader state transition for topic %s partition %d"
.format(config.brokerId, replica.topic, replica.partition.partitionId))
}catch {
case e => error("Broker %d failed to complete the leader state transition for topic %s partition %d"
.format(config.brokerId, replica.topic, replica.partition.partitionId), e)
}finally {
leaderReplicaLock.unlock()
}
}
def makeFollower(replica: Replica, leaderBrokerId: Int, zkClient: ZkClient) {
info("broker %d intending to follow leader %d for topic %s partition %d"
info("Broker %d starting the follower state transition to follow leader %d for topic %s partition %d"
.format(config.brokerId, leaderBrokerId, replica.topic, replica.partition.partitionId))
// set the leader for this partition correctly on this broker
replica.partition.leaderId(Some(leaderBrokerId))
// remove this replica's partition from the ISR expiration queue
try {
// set the leader for this partition correctly on this broker
replica.partition.leaderId(Some(leaderBrokerId))
replica.log match {
case Some(log) => // log is already started
log.truncateTo(replica.highWatermark())
case None =>
}
// get leader for this replica
val leaderBroker = ZkUtils.getBrokerInfoFromIds(zkClient, List(leaderBrokerId)).head
val currentLeaderBroker = replicaFetcherManager.fetcherSourceBroker(replica.topic, replica.partition.partitionId)
// become follower only if it is not already following the same leader
if( currentLeaderBroker == None || currentLeaderBroker.get != leaderBroker.id) {
info("broker %d becoming follower to leader %d for topic %s partition %d"
.format(config.brokerId, leaderBrokerId, replica.topic, replica.partition.partitionId))
// stop fetcher thread to previous leader
replicaFetcherManager.removeFetcher(replica.topic, replica.partition.partitionId)
// start fetcher thread to current leader
replicaFetcherManager.addFetcher(replica.topic, replica.partition.partitionId, replica.logEndOffset(), leaderBroker)
}
// remove this replica's partition from the ISR expiration queue
leaderReplicaLock.lock()
leaderReplicas -= replica.partition
info("Broker %d completed the follower state transition to follow leader %d for topic %s partition %d"
.format(config.brokerId, leaderBrokerId, replica.topic, replica.partition.partitionId))
}catch {
case e => error("Broker %d failed to complete the follower state transition to follow leader %d for topic %s partition %d"
.format(config.brokerId, leaderBrokerId, replica.topic, replica.partition.partitionId), e)
}finally {
leaderReplicaLock.unlock()
}
replica.log match {
case Some(log) => // log is already started
log.recoverUptoLastCheckpointedHW()
case None =>
}
// get leader for this replica
val leaderBroker = ZkUtils.getBrokerInfoFromIds(zkClient, List(leaderBrokerId)).head
val currentLeaderBroker = replicaFetcherManager.fetcherSourceBroker(replica.topic, replica.partition.partitionId)
// Become follower only if it is not already following the same leader
if( currentLeaderBroker == None || currentLeaderBroker.get != leaderBroker.id) {
info("broker %d becoming follower to leader %d for topic %s partition %d"
.format(config.brokerId, leaderBrokerId, replica.topic, replica.partition.partitionId))
// stop fetcher thread to previous leader
replicaFetcherManager.removeFetcher(replica.topic, replica.partition.partitionId)
// start fetcher thread to current leader
replicaFetcherManager.addFetcher(replica.topic, replica.partition.partitionId, replica.logEndOffset(), leaderBroker)
}
}
def maybeShrinkISR(): Unit = {
private def maybeShrinkISR(): Unit = {
try {
info("Evaluating ISR list of partitions to see which replicas can be removed from the ISR"
.format(config.replicaMaxLagTimeMs))
leaderReplicaLock.lock()
leaderReplicas.foreach { partition =>
// shrink ISR if a follower is slow or stuck
// shrink ISR if a follower is slow or stuck
val outOfSyncReplicas = partition.getOutOfSyncReplicas(config.replicaMaxLagTimeMs, config.replicaMaxLagBytes)
if(outOfSyncReplicas.size > 0) {
val newInSyncReplicas = partition.inSyncReplicas -- outOfSyncReplicas
@ -210,7 +226,7 @@ class ReplicaManager(val config: KafkaConfig, time: Time, zkClient: ZkClient) ex
}
}
def checkIfISRCanBeExpanded(replica: Replica): Boolean = {
private def checkIfISRCanBeExpanded(replica: Replica): Boolean = {
val partition = ensurePartitionExists(replica.topic, replica.partition.partitionId)
if(partition.inSyncReplicas.contains(replica)) false
else if(partition.assignedReplicas().contains(replica)) {
@ -225,7 +241,7 @@ class ReplicaManager(val config: KafkaConfig, time: Time, zkClient: ZkClient) ex
val replicaOpt = getReplica(topic, partition, replicaId)
replicaOpt match {
case Some(replica) =>
updateReplicaLEO(replica, offset)
updateReplicaLeo(replica, offset)
// check if this replica needs to be added to the ISR
if(checkIfISRCanBeExpanded(replica)) {
val newISR = replica.partition.inSyncReplicas + replica
@ -239,20 +255,45 @@ class ReplicaManager(val config: KafkaConfig, time: Time, zkClient: ZkClient) ex
}
}
def recordLeaderLogUpdate(topic: String, partition: Int) = {
def recordLeaderLogEndOffset(topic: String, partition: Int, logEndOffset: Long) = {
val replicaOpt = getReplica(topic, partition, config.brokerId)
replicaOpt match {
case Some(replica) =>
replica.logEndOffsetUpdateTime(Some(time.milliseconds))
case Some(replica) => replica.logEndOffset(Some(logEndOffset))
case None =>
throw new KafkaException("No replica %d in replica manager on %d".format(config.brokerId, config.brokerId))
}
}
def close() {
info("Closing replica manager on broker " + config.brokerId)
isrExpirationScheduler.shutdown()
/**
* Flushes the highwatermark value for all partitions to the highwatermark file
*/
private def checkpointHighwaterMarks() {
val highwaterMarksForAllPartitions = allReplicas.map { partition =>
val topic = partition._1._1
val partitionId = partition._1._2
val localReplicaOpt = partition._2.getReplica(config.brokerId)
val hw = localReplicaOpt match {
case Some(localReplica) => localReplica.highWatermark()
case None =>
error("Error while checkpointing highwatermark for topic %s partition %d.".format(topic, partitionId) +
" Replica metadata doesn't exist in replica manager on broker " + config.brokerId)
0L
}
(topic, partitionId) -> hw
}.toMap
highwaterMarkCheckpoint.write(highwaterMarksForAllPartitions)
info("Checkpointed highwatermarks")
}
/**
* Reads the checkpointed high watermark for a single partition
* @return the checkpointed high watermark for the given topic and partition; 0 if no entry exists
*/
def readCheckpointedHighWatermark(topic: String, partition: Int): Long = highwaterMarkCheckpoint.read(topic, partition)
def shutdown() {
replicaFetcherManager.shutdown()
checkpointHighwaterMarks()
info("Replica manager shutdown on broker " + config.brokerId)
}
}
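Taken together: produce requests advance the leader's leo through recordLeaderLogEndOffset, follower fetches advance the remote leos and may raise the leader HW, and checkpointHighwaterMarks periodically persists every local replica's HW via the checkpoint file above. A hedged sketch of the "min of the leo of all replicas" rule that maybeIncrementLeaderHW's comment describes (the helper name is illustrative, not part of this patch):
  def candidateLeaderHw(partition: Partition): Long =
    partition.inSyncReplicas.map(_.logEndOffset()).min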

View File

@ -17,7 +17,6 @@
package kafka.utils
import kafka.common.KafkaException
import java.lang.IllegalStateException
class State

View File

@ -18,20 +18,24 @@
package kafka.utils
import java.util.concurrent._
import java.util.concurrent.atomic._
import atomic._
import collection.mutable.HashMap
/**
* A scheduler for running jobs in the background
*/
class KafkaScheduler(val numThreads: Int, val baseThreadName: String, isDaemon: Boolean) extends Logging {
private val threadId = new AtomicLong(0)
class KafkaScheduler(val numThreads: Int) extends Logging {
private var executor:ScheduledThreadPoolExecutor = null
startUp
private val daemonThreadFactory = new ThreadFactory() {
def newThread(runnable: Runnable): Thread = Utils.newThread(runnable, true)
}
private val nonDaemonThreadFactory = new ThreadFactory() {
def newThread(runnable: Runnable): Thread = Utils.newThread(runnable, false)
}
private val threadNamesAndIds = new HashMap[String, AtomicInteger]()
def startUp = {
executor = new ScheduledThreadPoolExecutor(numThreads, new ThreadFactory() {
def newThread(runnable: Runnable): Thread = Utils.daemonThread(baseThreadName + threadId.getAndIncrement, runnable)
})
executor = new ScheduledThreadPoolExecutor(numThreads)
executor.setContinueExistingPeriodicTasksAfterShutdownPolicy(false)
executor.setExecuteExistingDelayedTasksAfterShutdownPolicy(false)
}
@ -43,20 +47,26 @@ class KafkaScheduler(val numThreads: Int, val baseThreadName: String, isDaemon:
throw new IllegalStateException("Kafka scheduler has not been started")
}
def scheduleWithRate(fun: () => Unit, delayMs: Long, periodMs: Long) = {
def scheduleWithRate(fun: () => Unit, name: String, delayMs: Long, periodMs: Long, isDaemon: Boolean = true) = {
ensureExecutorHasStarted
executor.scheduleAtFixedRate(Utils.loggedRunnable(fun), delayMs, periodMs, TimeUnit.MILLISECONDS)
if(isDaemon)
executor.setThreadFactory(daemonThreadFactory)
else
executor.setThreadFactory(nonDaemonThreadFactory)
val threadId = threadNamesAndIds.getOrElseUpdate(name, new AtomicInteger(0))
executor.scheduleAtFixedRate(Utils.loggedRunnable(fun, name + threadId.incrementAndGet()), delayMs, periodMs,
TimeUnit.MILLISECONDS)
}
def shutdownNow() {
ensureExecutorHasStarted
executor.shutdownNow()
info("Forcing shutdown of scheduler " + baseThreadName)
info("Forcing shutdown of Kafka scheduler")
}
def shutdown() {
ensureExecutorHasStarted
executor.shutdown()
info("Shutdown scheduler " + baseThreadName)
info("Shutdown Kafka scheduler")
}
}
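With the thread name and daemon flag now supplied per task rather than per scheduler, a caller schedules a job roughly like this (task body, name, and intervals are illustrative):
  val scheduler = new KafkaScheduler(1)
  scheduler.startUp
  scheduler.scheduleWithRate(() => println("tick"), "demo-task-", 0, 1000, isDaemon = false)
  // ... later
  scheduler.shutdown()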

View File

@ -58,9 +58,10 @@ object Utils extends Logging {
* @param fun A function
* @return A Runnable that just executes the function
*/
def loggedRunnable(fun: () => Unit): Runnable =
def loggedRunnable(fun: () => Unit, name: String): Runnable =
new Runnable() {
def run() = {
Thread.currentThread().setName(name)
try {
fun()
}
@ -72,6 +73,14 @@ object Utils extends Logging {
}
}
/**
* Create a daemon thread
* @param runnable The runnable to execute in the background
* @return The unstarted thread
*/
def daemonThread(runnable: Runnable): Thread =
newThread(runnable, true)
/**
* Create a daemon thread
* @param name The name of the thread
@ -108,6 +117,23 @@ object Utils extends Logging {
thread
}
/**
* Create a new thread
* @param runnable The work for the thread to do
* @param daemon Should the thread block JVM shutdown?
* @return The unstarted thread
*/
def newThread(runnable: Runnable, daemon: Boolean): Thread = {
val thread = new Thread(runnable)
thread.setDaemon(daemon)
thread.setUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
def uncaughtException(t: Thread, e: Throwable) {
error("Uncaught exception in thread '" + t.getName + "':", e)
}
})
thread
}
/**
* Read a byte array from the given offset and size in the buffer
* TODO: Should use System.arraycopy

View File

@ -37,6 +37,7 @@ class LogManagerTest extends JUnit3Suite with ZooKeeperTestHarness {
val zookeeperConnect = TestZKUtils.zookeeperConnect
val name = "kafka"
val veryLargeLogFlushInterval = 10000000L
val scheduler = new KafkaScheduler(2)
override def setUp() {
super.setUp()
@ -45,7 +46,8 @@ class LogManagerTest extends JUnit3Suite with ZooKeeperTestHarness {
override val logFileSize = 1024
override val flushInterval = 100
}
logManager = new LogManager(config, time, veryLargeLogFlushInterval, maxLogAge, false)
scheduler.startUp
logManager = new LogManager(config, scheduler, time, veryLargeLogFlushInterval, maxLogAge, false)
logManager.startup
logDir = logManager.logDir
@ -56,8 +58,9 @@ class LogManagerTest extends JUnit3Suite with ZooKeeperTestHarness {
}
override def tearDown() {
scheduler.shutdown()
if(logManager != null)
logManager.close()
logManager.shutdown()
Utils.rm(logDir)
super.tearDown()
}
@ -114,7 +117,7 @@ class LogManagerTest extends JUnit3Suite with ZooKeeperTestHarness {
val retentionHours = 1
val retentionMs = 1000 * 60 * 60 * retentionHours
val props = TestUtils.createBrokerConfig(0, -1)
logManager.close
logManager.shutdown()
Thread.sleep(100)
config = new KafkaConfig(props) {
override val logFileSize = (10 * (setSize - 1)).asInstanceOf[Int] // each segment will be 10 messages
@ -122,7 +125,7 @@ class LogManagerTest extends JUnit3Suite with ZooKeeperTestHarness {
override val logRetentionHours = retentionHours
override val flushInterval = 100
}
logManager = new LogManager(config, time, veryLargeLogFlushInterval, retentionMs, false)
logManager = new LogManager(config, scheduler, time, veryLargeLogFlushInterval, retentionMs, false)
logManager.startup
// create a log
@ -159,7 +162,7 @@ class LogManagerTest extends JUnit3Suite with ZooKeeperTestHarness {
@Test
def testTimeBasedFlush() {
val props = TestUtils.createBrokerConfig(0, -1)
logManager.close
logManager.shutdown()
Thread.sleep(100)
config = new KafkaConfig(props) {
override val logFileSize = 1024 *1024 *1024
@ -167,7 +170,7 @@ class LogManagerTest extends JUnit3Suite with ZooKeeperTestHarness {
override val flushInterval = Int.MaxValue
override val flushIntervalMap = Utils.getTopicFlushIntervals("timebasedflush:100")
}
logManager = new LogManager(config, time, veryLargeLogFlushInterval, maxLogAge, false)
logManager = new LogManager(config, scheduler, time, veryLargeLogFlushInterval, maxLogAge, false)
logManager.startup
val log = logManager.getOrCreateLog(name, 0)
for(i <- 0 until 200) {
@ -182,15 +185,14 @@ class LogManagerTest extends JUnit3Suite with ZooKeeperTestHarness {
@Test
def testConfigurablePartitions() {
val props = TestUtils.createBrokerConfig(0, -1)
logManager.close
logManager.shutdown()
Thread.sleep(100)
config = new KafkaConfig(props) {
override val logFileSize = 256
override val topicPartitionsMap = Utils.getTopicPartitions("testPartition:2")
override val flushInterval = 100
}
logManager = new LogManager(config, time, veryLargeLogFlushInterval, maxLogAge, false)
logManager = new LogManager(config, scheduler, time, veryLargeLogFlushInterval, maxLogAge, false)
logManager.startup
for(i <- 0 until 1) {

View File

@ -13,7 +13,7 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
*/
package kafka.producer
@ -35,7 +35,7 @@ import java.util
class ProducerTest extends JUnit3Suite with ZooKeeperTestHarness {
private val brokerId1 = 0
private val brokerId2 = 1
private val brokerId2 = 1
private val ports = TestUtils.choosePorts(2)
private val (port1, port2) = (ports(0), ports(1))
private var server1: KafkaServer = null
@ -80,7 +80,7 @@ class ProducerTest extends JUnit3Suite with ZooKeeperTestHarness {
server2.shutdown
server2.awaitShutdown()
Utils.rm(server1.config.logDir)
Utils.rm(server2.config.logDir)
Utils.rm(server2.config.logDir)
Thread.sleep(500)
super.tearDown()
}
@ -97,7 +97,7 @@ class ProducerTest extends JUnit3Suite with ZooKeeperTestHarness {
val props2 = new util.Properties()
props2.putAll(props1)
props2.put("producer.request.required.acks", "3")
props2.put("producer.request.ack.timeout.ms", "1000")
props2.put("producer.request.timeout.ms", "1000")
val config1 = new ProducerConfig(props1)
val config2 = new ProducerConfig(props2)
@ -108,33 +108,28 @@ class ProducerTest extends JUnit3Suite with ZooKeeperTestHarness {
val producer1 = new Producer[String, String](config1)
val producer2 = new Producer[String, String](config2)
try {
// Available partition ids should be 0.
producer1.send(new ProducerData[String, String]("new-topic", "test", Array("test1")))
producer1.send(new ProducerData[String, String]("new-topic", "test", Array("test1")))
// get the leader
val leaderOpt = ZkUtils.getLeaderForPartition(zkClient, "new-topic", 0)
assertTrue("Leader for topic new-topic partition 0 should exist", leaderOpt.isDefined)
val leader = leaderOpt.get
// Available partition ids should be 0.
producer1.send(new ProducerData[String, String]("new-topic", "test", Array("test1")))
producer1.send(new ProducerData[String, String]("new-topic", "test", Array("test1")))
// get the leader
val leaderOpt = ZkUtils.getLeaderForPartition(zkClient, "new-topic", 0)
assertTrue("Leader for topic new-topic partition 0 should exist", leaderOpt.isDefined)
val leader = leaderOpt.get
val messageSet = if(leader == server1.config.brokerId) {
val response1 = consumer1.fetch(new FetchRequestBuilder().addFetch("new-topic", 0, 0, 10000).build())
response1.messageSet("new-topic", 0).iterator
}else {
val response2 = consumer2.fetch(new FetchRequestBuilder().addFetch("new-topic", 0, 0, 10000).build())
response2.messageSet("new-topic", 0).iterator
}
assertTrue("Message set should have 1 message", messageSet.hasNext)
assertEquals(new Message("test1".getBytes), messageSet.next.message)
assertTrue("Message set should have 1 message", messageSet.hasNext)
assertEquals(new Message("test1".getBytes), messageSet.next.message)
assertFalse("Message set should not have any more messages", messageSet.hasNext)
} catch {
case e: Exception => fail("Not expected", e)
} finally {
producer1.close()
val messageSet = if(leader == server1.config.brokerId) {
val response1 = consumer1.fetch(new FetchRequestBuilder().addFetch("new-topic", 0, 0, 10000).build())
response1.messageSet("new-topic", 0).iterator
}else {
val response2 = consumer2.fetch(new FetchRequestBuilder().addFetch("new-topic", 0, 0, 10000).build())
response2.messageSet("new-topic", 0).iterator
}
assertTrue("Message set should have 1 message", messageSet.hasNext)
assertEquals(new Message("test1".getBytes), messageSet.next.message)
assertTrue("Message set should have 1 message", messageSet.hasNext)
assertEquals(new Message("test1".getBytes), messageSet.next.message)
assertFalse("Message set should not have any more messages", messageSet.hasNext)
producer1.close()
try {
producer2.send(new ProducerData[String, String]("new-topic", "test", Array("test2")))

View File

@ -0,0 +1,146 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package kafka.server
import kafka.log.Log
import org.I0Itec.zkclient.ZkClient
import org.scalatest.junit.JUnit3Suite
import org.easymock.EasyMock
import org.junit.Assert._
import kafka.utils.{KafkaScheduler, TestUtils, MockTime}
import kafka.common.KafkaException
class HighwatermarkPersistenceTest extends JUnit3Suite {
val configs = TestUtils.createBrokerConfigs(2).map(new KafkaConfig(_) {
override val defaultFlushIntervalMs = 100
})
val topic = "foo"
def testHighWatermarkPersistenceSinglePartition() {
// mock zkclient
val zkClient = EasyMock.createMock(classOf[ZkClient])
EasyMock.replay(zkClient)
// create kafka scheduler
val scheduler = new KafkaScheduler(2)
scheduler.startUp
// create replica manager
val replicaManager = new ReplicaManager(configs.head, new MockTime(), zkClient, scheduler)
replicaManager.startup()
// sleep until flush ms
Thread.sleep(configs.head.defaultFlushIntervalMs)
var fooPartition0Hw = replicaManager.readCheckpointedHighWatermark(topic, 0)
assertEquals(0L, fooPartition0Hw)
val partition0 = replicaManager.getOrCreatePartition(topic, 0, configs.map(_.brokerId).toSet)
// create leader log
val log0 = getMockLog
// create leader and follower replicas
val leaderReplicaPartition0 = replicaManager.addLocalReplica(topic, 0, log0, configs.map(_.brokerId).toSet)
val followerReplicaPartition0 = replicaManager.addRemoteReplica(topic, 0, configs.last.brokerId, partition0)
// sleep until flush ms
Thread.sleep(configs.head.defaultFlushIntervalMs)
fooPartition0Hw = replicaManager.readCheckpointedHighWatermark(topic, 0)
assertEquals(leaderReplicaPartition0.highWatermark(), fooPartition0Hw)
try {
followerReplicaPartition0.highWatermark()
fail("Should fail with IllegalStateException")
}catch {
case e: KafkaException => // this is ok
}
// set the leader
partition0.leaderId(Some(leaderReplicaPartition0.brokerId))
// set the highwatermark for local replica
partition0.leaderHW(Some(5L))
// sleep until flush interval
Thread.sleep(configs.head.defaultFlushIntervalMs)
fooPartition0Hw = replicaManager.readCheckpointedHighWatermark(topic, 0)
assertEquals(leaderReplicaPartition0.highWatermark(), fooPartition0Hw)
EasyMock.verify(zkClient)
EasyMock.verify(log0)
}
def testHighWatermarkPersistenceMultiplePartitions() {
val topic1 = "foo1"
val topic2 = "foo2"
// mock zkclient
val zkClient = EasyMock.createMock(classOf[ZkClient])
EasyMock.replay(zkClient)
// create kafka scheduler
val scheduler = new KafkaScheduler(2)
scheduler.startUp
// create replica manager
val replicaManager = new ReplicaManager(configs.head, new MockTime(), zkClient, scheduler)
replicaManager.startup()
// sleep until flush ms
Thread.sleep(configs.head.defaultFlushIntervalMs)
var topic1Partition0Hw = replicaManager.readCheckpointedHighWatermark(topic1, 0)
assertEquals(0L, topic1Partition0Hw)
val topic1Partition0 = replicaManager.getOrCreatePartition(topic1, 0, configs.map(_.brokerId).toSet)
// create leader log
val topic1Log0 = getMockLog
// create leader and follower replicas
val leaderReplicaTopic1Partition0 = replicaManager.addLocalReplica(topic1, 0, topic1Log0, configs.map(_.brokerId).toSet)
// sleep until flush ms
Thread.sleep(configs.head.defaultFlushIntervalMs)
topic1Partition0Hw = replicaManager.readCheckpointedHighWatermark(topic1, 0)
assertEquals(leaderReplicaTopic1Partition0.highWatermark(), topic1Partition0Hw)
// set the leader
topic1Partition0.leaderId(Some(leaderReplicaTopic1Partition0.brokerId))
// set the highwatermark for local replica
topic1Partition0.leaderHW(Some(5L))
// sleep until flush interval
Thread.sleep(configs.head.defaultFlushIntervalMs)
topic1Partition0Hw = replicaManager.readCheckpointedHighWatermark(topic1, 0)
assertEquals(5L, leaderReplicaTopic1Partition0.highWatermark())
assertEquals(5L, topic1Partition0Hw)
// add another partition and set highwatermark
val topic2Partition0 = replicaManager.getOrCreatePartition(topic2, 0, configs.map(_.brokerId).toSet)
// create leader log
val topic2Log0 = getMockLog
// create leader and follower replicas
val leaderReplicaTopic2Partition0 = replicaManager.addLocalReplica(topic2, 0, topic2Log0, configs.map(_.brokerId).toSet)
// sleep until flush ms
Thread.sleep(configs.head.defaultFlushIntervalMs)
var topic2Partition0Hw = replicaManager.readCheckpointedHighWatermark(topic2, 0)
assertEquals(leaderReplicaTopic2Partition0.highWatermark(), topic2Partition0Hw)
// set the leader
topic2Partition0.leaderId(Some(leaderReplicaTopic2Partition0.brokerId))
// set the highwatermark for local replica
topic2Partition0.leaderHW(Some(15L))
assertEquals(15L, leaderReplicaTopic2Partition0.highWatermark())
// change the highwatermark for topic1
topic1Partition0.leaderHW(Some(10L))
assertEquals(10L, leaderReplicaTopic1Partition0.highWatermark())
// sleep until flush interval
Thread.sleep(configs.head.defaultFlushIntervalMs)
// verify checkpointed hw for topic 2
topic2Partition0Hw = replicaManager.readCheckpointedHighWatermark(topic2, 0)
assertEquals(15L, topic2Partition0Hw)
// verify checkpointed hw for topic 1
topic1Partition0Hw = replicaManager.readCheckpointedHighWatermark(topic1, 0)
assertEquals(10L, topic1Partition0Hw)
EasyMock.verify(zkClient)
EasyMock.verify(topic1Log0)
EasyMock.verify(topic2Log0)
}
private def getMockLog: Log = {
val log = EasyMock.createMock(classOf[kafka.log.Log])
EasyMock.replay(log)
log
}
}

View File

@ -22,9 +22,9 @@ import collection.mutable.Map
import kafka.cluster.{Partition, Replica}
import org.easymock.EasyMock
import kafka.log.Log
import kafka.utils.{Time, MockTime, TestUtils}
import org.junit.Assert._
import org.I0Itec.zkclient.ZkClient
import kafka.utils.{KafkaScheduler, Time, MockTime, TestUtils}
class ISRExpirationTest extends JUnit3Suite {
@ -40,7 +40,6 @@ class ISRExpirationTest extends JUnit3Suite {
// create leader replica
val log = EasyMock.createMock(classOf[kafka.log.Log])
EasyMock.expect(log.logEndOffset).andReturn(5L).times(12)
EasyMock.expect(log.setHW(5L)).times(1)
EasyMock.replay(log)
// add one partition
@ -48,10 +47,10 @@ class ISRExpirationTest extends JUnit3Suite {
assertEquals("All replicas should be in ISR", configs.map(_.brokerId).toSet, partition0.inSyncReplicas.map(_.brokerId))
val leaderReplica = partition0.getReplica(configs.head.brokerId).get
// set remote replicas leo to something low, like 2
(partition0.assignedReplicas() - leaderReplica).foreach(partition0.updateReplicaLEO(_, 2))
(partition0.assignedReplicas() - leaderReplica).foreach(partition0.updateReplicaLeo(_, 2))
time.sleep(150)
leaderReplica.logEndOffsetUpdateTime(Some(time.milliseconds))
leaderReplica.logEndOffset(Some(5L))
var partition0OSR = partition0.getOutOfSyncReplicas(configs.head.replicaMaxLagTimeMs, configs.head.replicaMaxLagBytes)
assertEquals("Replica 1 should be out of sync", Set(configs.last.brokerId), partition0OSR.map(_.brokerId))
@ -60,9 +59,9 @@ class ISRExpirationTest extends JUnit3Suite {
partition0.inSyncReplicas ++= partition0.assignedReplicas()
assertEquals("Replica 1 should be in sync", configs.map(_.brokerId).toSet, partition0.inSyncReplicas.map(_.brokerId))
leaderReplica.logEndOffsetUpdateTime(Some(time.milliseconds))
leaderReplica.logEndOffset(Some(5L))
// let the follower catch up only upto 3
(partition0.assignedReplicas() - leaderReplica).foreach(partition0.updateReplicaLEO(_, 3))
(partition0.assignedReplicas() - leaderReplica).foreach(partition0.updateReplicaLeo(_, 3))
time.sleep(150)
// now follower broker id 1 has caught upto only 3, while the leader is at 5 AND follower broker id 1 hasn't
// pulled any data for > replicaMaxLagTimeMs ms. So it is stuck
@ -80,12 +79,12 @@ class ISRExpirationTest extends JUnit3Suite {
assertEquals("All replicas should be in ISR", configs.map(_.brokerId).toSet, partition0.inSyncReplicas.map(_.brokerId))
val leaderReplica = partition0.getReplica(configs.head.brokerId).get
// set remote replicas leo to something low, like 4
(partition0.assignedReplicas() - leaderReplica).foreach(partition0.updateReplicaLEO(_, 4))
(partition0.assignedReplicas() - leaderReplica).foreach(partition0.updateReplicaLeo(_, 4))
time.sleep(150)
leaderReplica.logEndOffsetUpdateTime(Some(time.milliseconds))
leaderReplica.logEndOffset(Some(15L))
time.sleep(10)
(partition0.inSyncReplicas - leaderReplica).foreach(r => r.logEndOffsetUpdateTime(Some(time.milliseconds)))
(partition0.inSyncReplicas - leaderReplica).foreach(r => r.logEndOffset(Some(4)))
val partition0OSR = partition0.getOutOfSyncReplicas(configs.head.replicaMaxLagTimeMs, configs.head.replicaMaxLagBytes)
assertEquals("Replica 1 should be out of sync", Set(configs.last.brokerId), partition0OSR.map(_.brokerId))
@ -98,8 +97,10 @@ class ISRExpirationTest extends JUnit3Suite {
// mock zkclient
val zkClient = EasyMock.createMock(classOf[ZkClient])
EasyMock.replay(zkClient)
// create kafka scheduler
val scheduler = new KafkaScheduler(2)
// create replica manager
val replicaManager = new ReplicaManager(configs.head, time, zkClient)
val replicaManager = new ReplicaManager(configs.head, time, zkClient, scheduler)
try {
val partition0 = replicaManager.getOrCreatePartition(topic, 0, configs.map(_.brokerId).toSet)
// create leader log
@ -115,7 +116,7 @@ class ISRExpirationTest extends JUnit3Suite {
partition0.leaderHW(Some(5L))
// set the leo for non-leader replicas to something low
(partition0.assignedReplicas() - leaderReplicaPartition0).foreach(r => partition0.updateReplicaLEO(r, 2))
(partition0.assignedReplicas() - leaderReplicaPartition0).foreach(r => partition0.updateReplicaLeo(r, 2))
val log1 = getLogWithHW(15L)
// create leader and follower replicas for partition 1
@ -129,13 +130,13 @@ class ISRExpirationTest extends JUnit3Suite {
partition1.leaderHW(Some(15L))
// set the leo for non-leader replicas to something low
(partition1.assignedReplicas() - leaderReplicaPartition1).foreach(r => partition1.updateReplicaLEO(r, 4))
(partition1.assignedReplicas() - leaderReplicaPartition1).foreach(r => partition1.updateReplicaLeo(r, 4))
time.sleep(150)
leaderReplicaPartition0.logEndOffsetUpdateTime(Some(time.milliseconds))
leaderReplicaPartition1.logEndOffsetUpdateTime(Some(time.milliseconds))
leaderReplicaPartition0.logEndOffset(Some(4L))
leaderReplicaPartition1.logEndOffset(Some(4L))
time.sleep(10)
(partition1.inSyncReplicas - leaderReplicaPartition1).foreach(r => r.logEndOffsetUpdateTime(Some(time.milliseconds)))
(partition1.inSyncReplicas - leaderReplicaPartition1).foreach(r => r.logEndOffset(Some(4L)))
val partition0OSR = partition0.getOutOfSyncReplicas(configs.head.replicaMaxLagTimeMs, configs.head.replicaMaxLagBytes)
assertEquals("Replica 1 should be out of sync", Set(configs.last.brokerId), partition0OSR.map(_.brokerId))
@ -148,16 +149,16 @@ class ISRExpirationTest extends JUnit3Suite {
}catch {
case e => e.printStackTrace()
}finally {
replicaManager.close()
replicaManager.shutdown()
}
}
private def getPartitionWithAllReplicasInISR(topic: String, partitionId: Int, time: Time, leaderId: Int,
localLog: Log, leaderHW: Long): Partition = {
val partition = new Partition(topic, partitionId, time)
val leaderReplica = new Replica(leaderId, partition, topic, Some(localLog))
val leaderReplica = new Replica(leaderId, partition, topic, time, Some(leaderHW), Some(localLog))
val allReplicas = getFollowerReplicas(partition, leaderId) :+ leaderReplica
val allReplicas = getFollowerReplicas(partition, leaderId, time) :+ leaderReplica
partition.assignedReplicas(Some(allReplicas.toSet))
// set in sync replicas for this partition to all the assigned replicas
partition.inSyncReplicas = allReplicas.toSet
@ -170,15 +171,14 @@ class ISRExpirationTest extends JUnit3Suite {
private def getLogWithHW(hw: Long): Log = {
val log1 = EasyMock.createMock(classOf[kafka.log.Log])
EasyMock.expect(log1.logEndOffset).andReturn(hw).times(6)
EasyMock.expect(log1.setHW(hw)).times(1)
EasyMock.replay(log1)
log1
}
private def getFollowerReplicas(partition: Partition, leaderId: Int): Seq[Replica] = {
private def getFollowerReplicas(partition: Partition, leaderId: Int, time: Time): Seq[Replica] = {
configs.filter(_.brokerId != leaderId).map { config =>
new Replica(config.brokerId, partition, topic)
new Replica(config.brokerId, partition, topic, time)
}
}
}
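These tests drive staleness with a mock clock: time.sleep(150) advances the fake clock past the configured lag threshold without actually blocking, so the ISR check sees stale follower timestamps deterministically. A compressed sketch of the pattern (MockClock and the 100 ms threshold are illustrative, not the test harness API):

// Illustrative mock clock; sleep() skews time instead of blocking.
class MockClock(private var nowMs: Long = 0L) {
  def milliseconds: Long = nowMs
  def sleep(deltaMs: Long): Unit = nowMs += deltaMs
}

val clock = new MockClock
val followerUpdatedAt = clock.milliseconds // follower's last leo update
clock.sleep(150)                           // no real waiting, just clock skew
val maxLagMs = 100L                        // illustrative lag threshold
assert(clock.milliseconds - followerUpdatedAt > maxLagMs) // follower now looks stuck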

View File

@ -7,7 +7,6 @@ import kafka.utils.TestUtils._
import kafka.utils.{Utils, TestUtils}
import kafka.zk.ZooKeeperTestHarness
import kafka.message.Message
import java.io.RandomAccessFile
import kafka.producer.{ProducerConfig, ProducerData, Producer}
import org.junit.Test
@ -34,15 +33,12 @@ class LogRecoveryTest extends JUnit3Suite with ZooKeeperTestHarness {
val configProps1 = configs.head
val configProps2 = configs.last
val server1HWFile = configProps1.logDir + "/" + topic + "-0/highwatermark"
val server2HWFile = configProps2.logDir + "/" + topic + "-0/highwatermark"
val sent1 = List(new Message("hello".getBytes()), new Message("there".getBytes()))
val sent2 = List( new Message("more".getBytes()), new Message("messages".getBytes()))
var producer: Producer[Int, Message] = null
var hwFile1: RandomAccessFile = null
var hwFile2: RandomAccessFile = null
var hwFile1: HighwaterMarkCheckpoint = new HighwaterMarkCheckpoint(configProps1.logDir)
var hwFile2: HighwaterMarkCheckpoint = new HighwaterMarkCheckpoint(configProps2.logDir)
var servers: Seq[KafkaServer] = Seq.empty[KafkaServer]
@Test
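From this point the test reads high watermarks through a HighwaterMarkCheckpoint built per log directory, replacing the per-partition RandomAccessFile. A plausible self-contained sketch of such a checkpoint follows; the on-disk layout (one "topic partition hw" line per entry, swapped in via a temp file) is an assumption, not necessarily the committed format:

import java.io.{BufferedWriter, File, FileWriter}
import scala.io.Source

// Sketch of a single-file high watermark checkpoint covering every
// partition in a log directory. The exact file format is assumed.
class HwCheckpointSketch(logDir: String) {
  private val file = new File(logDir, ".highwatermark")

  def write(hws: Map[(String, Int), Long]): Unit = {
    val tmp = new File(file.getAbsolutePath + ".tmp")
    val out = new BufferedWriter(new FileWriter(tmp))
    try {
      hws.foreach { case ((topic, partition), hw) =>
        out.write("%s %d %d".format(topic, partition, hw))
        out.newLine()
      }
    } finally {
      out.close()
    }
    tmp.renameTo(file) // swap in so readers never see a half-written file
  }

  def read(topic: String, partition: Int): Long = {
    if (!file.exists) return 0L // nothing checkpointed yet
    val src = Source.fromFile(file)
    try {
      src.getLines()
        .map(_.trim.split(" "))
        .collectFirst { case Array(t, p, hw) if t == topic && p.toInt == partition => hw.toLong }
        .getOrElse(0L)
    } finally {
      src.close()
    }
  }
}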
@ -66,22 +62,15 @@ class LogRecoveryTest extends JUnit3Suite with ZooKeeperTestHarness {
// NOTE: this is to avoid transient test failures
assertTrue("Leader could be broker 0 or broker 1", (leader.getOrElse(-1) == 0) || (leader.getOrElse(-1) == 1))
sendMessages()
hwFile1 = new RandomAccessFile(server1HWFile, "r")
hwFile2 = new RandomAccessFile(server2HWFile, "r")
sendMessages()
sendMessages(2)
// don't wait for follower to read the leader's hw
// shutdown the servers to allow the hw to be checkpointed
servers.map(server => server.shutdown())
producer.close()
val leaderHW = readHW(hwFile1)
val leaderHW = hwFile1.read(topic, 0)
assertEquals(60L, leaderHW)
val followerHW = readHW(hwFile2)
val followerHW = hwFile2.read(topic, 0)
assertEquals(30L, followerHW)
hwFile1.close()
hwFile2.close()
servers.map(server => Utils.rm(server.config.logDir))
}
@ -105,16 +94,13 @@ class LogRecoveryTest extends JUnit3Suite with ZooKeeperTestHarness {
// NOTE: this is to avoid transient test failures
assertTrue("Leader could be broker 0 or broker 1", (leader.getOrElse(-1) == 0) || (leader.getOrElse(-1) == 1))
hwFile1 = new RandomAccessFile(server1HWFile, "r")
hwFile2 = new RandomAccessFile(server2HWFile, "r")
assertEquals(0L, readHW(hwFile1))
assertEquals(0L, hwFile1.read(topic, 0))
sendMessages()
// kill the server hosting the preferred replica
server1.shutdown()
assertEquals(30L, readHW(hwFile1))
assertEquals(30L, hwFile1.read(topic, 0))
// check if leader moves to the other server
leader = waitUntilLeaderIsElected(zkClient, topic, partitionId, 500)
@ -126,10 +112,10 @@ class LogRecoveryTest extends JUnit3Suite with ZooKeeperTestHarness {
leader = waitUntilLeaderIsElected(zkClient, topic, partitionId, 500)
assertEquals("Leader must remain on broker 1", 1, leader.getOrElse(-1))
assertEquals(30L, readHW(hwFile1))
assertEquals(30L, hwFile1.read(topic, 0))
// since server 2 was never shut down, the hw value of 30 is probably not checkpointed to disk yet
server2.shutdown()
assertEquals(30L, readHW(hwFile2))
assertEquals(30L, hwFile2.read(topic, 0))
server2.startup()
leader = waitUntilLeaderIsElected(zkClient, topic, partitionId, 500)
@ -137,17 +123,13 @@ class LogRecoveryTest extends JUnit3Suite with ZooKeeperTestHarness {
sendMessages()
// give some time for follower 1 to record leader HW of 60
Thread.sleep(500)
TestUtils.waitUntilTrue(() => server2.getReplica(topic, 0).get.highWatermark() == 60L, 500)
// shutdown the servers to allow the hw to be checkpointed
servers.map(server => server.shutdown())
Thread.sleep(200)
producer.close()
assert(hwFile1.length() > 0)
assert(hwFile2.length() > 0)
assertEquals(60L, readHW(hwFile1))
assertEquals(60L, readHW(hwFile2))
hwFile1.close()
hwFile2.close()
assertEquals(60L, hwFile1.read(topic, 0))
assertEquals(60L, hwFile2.read(topic, 0))
servers.map(server => Utils.rm(server.config.logDir))
}
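The fixed Thread.sleep calls above give way to TestUtils.waitUntilTrue, which polls a condition up to a timeout and removes most of the timing flakiness. A minimal sketch of a polling helper in that spirit (the real TestUtils signature may differ):

// Minimal polling helper in the spirit of TestUtils.waitUntilTrue;
// the real signature may differ.
def waitUntilTrue(condition: () => Boolean, timeoutMs: Long, pollMs: Long = 50L): Boolean = {
  val deadline = System.currentTimeMillis + timeoutMs
  while (System.currentTimeMillis < deadline) {
    if (condition()) return true
    Thread.sleep(pollMs)
  }
  condition() // one final check at the deadline
}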
@ -160,14 +142,14 @@ class LogRecoveryTest extends JUnit3Suite with ZooKeeperTestHarness {
override val logFileSize = 30
})
val server1HWFile = configs.head.logDir + "/" + topic + "-0/highwatermark"
val server2HWFile = configs.last.logDir + "/" + topic + "-0/highwatermark"
// start both servers
server1 = TestUtils.createServer(configs.head)
server2 = TestUtils.createServer(configs.last)
servers ++= List(server1, server2)
hwFile1 = new HighwaterMarkCheckpoint(server1.config.logDir)
hwFile2 = new HighwaterMarkCheckpoint(server2.config.logDir)
val producerProps = getProducerConfig(zkConnect, 64*1024, 100000, 10000)
producerProps.put("producer.request.timeout.ms", "1000")
producerProps.put("producer.request.required.acks", "-1")
@ -181,25 +163,16 @@ class LogRecoveryTest extends JUnit3Suite with ZooKeeperTestHarness {
assertTrue("Leader should get elected", leader.isDefined)
// NOTE: this is to avoid transient test failures
assertTrue("Leader could be broker 0 or broker 1", (leader.getOrElse(-1) == 0) || (leader.getOrElse(-1) == 1))
sendMessages(10)
hwFile1 = new RandomAccessFile(server1HWFile, "r")
hwFile2 = new RandomAccessFile(server2HWFile, "r")
sendMessages(10)
sendMessages(20)
// give some time for follower 1 to record leader HW of 600
Thread.sleep(500)
TestUtils.waitUntilTrue(() => server2.getReplica(topic, 0).get.highWatermark() == 600L, 500)
// shutdown the servers to allow the hw to be checkpointed
servers.map(server => server.shutdown())
producer.close()
val leaderHW = readHW(hwFile1)
val leaderHW = hwFile1.read(topic, 0)
assertEquals(600L, leaderHW)
val followerHW = readHW(hwFile2)
val followerHW = hwFile2.read(topic, 0)
assertEquals(600L, followerHW)
hwFile1.close()
hwFile2.close()
servers.map(server => Utils.rm(server.config.logDir))
}
@ -208,19 +181,18 @@ class LogRecoveryTest extends JUnit3Suite with ZooKeeperTestHarness {
override val replicaMaxLagTimeMs = 5000L
override val replicaMaxLagBytes = 10L
override val flushInterval = 1000
override val flushSchedulerThreadRate = 10
override val replicaMinBytes = 20
override val logFileSize = 30
})
val server1HWFile = configs.head.logDir + "/" + topic + "-0/highwatermark"
val server2HWFile = configs.last.logDir + "/" + topic + "-0/highwatermark"
// start both servers
server1 = TestUtils.createServer(configs.head)
server2 = TestUtils.createServer(configs.last)
servers ++= List(server1, server2)
hwFile1 = new HighwaterMarkCheckpoint(server1.config.logDir)
hwFile2 = new HighwaterMarkCheckpoint(server2.config.logDir)
val producerProps = getProducerConfig(zkConnect, 64*1024, 100000, 10000)
producerProps.put("producer.request.timeout.ms", "1000")
producerProps.put("producer.request.required.acks", "-1")
@ -235,43 +207,36 @@ class LogRecoveryTest extends JUnit3Suite with ZooKeeperTestHarness {
// NOTE: this is to avoid transient test failures
assertTrue("Leader could be broker 0 or broker 1", (leader.getOrElse(-1) == 0) || (leader.getOrElse(-1) == 1))
val hwFile1 = new RandomAccessFile(server1HWFile, "r")
val hwFile2 = new RandomAccessFile(server2HWFile, "r")
sendMessages(2)
// allow some time for the follower to get the leader HW
Thread.sleep(1000)
TestUtils.waitUntilTrue(() => server2.getReplica(topic, 0).get.highWatermark() == 60L, 1000)
// kill the server hosting the preferred replica
server1.shutdown()
server2.shutdown()
assertEquals(60L, readHW(hwFile1))
assertEquals(60L, readHW(hwFile2))
assertEquals(60L, hwFile1.read(topic, 0))
assertEquals(60L, hwFile2.read(topic, 0))
server2.startup()
// check if leader moves to the other server
leader = waitUntilLeaderIsElected(zkClient, topic, partitionId, 500)
assertEquals("Leader must move to broker 1", 1, leader.getOrElse(-1))
assertEquals(60L, readHW(hwFile1))
assertEquals(60L, hwFile1.read(topic, 0))
// bring the preferred replica back
server1.startup()
assertEquals(60L, readHW(hwFile1))
assertEquals(60L, readHW(hwFile2))
assertEquals(60L, hwFile1.read(topic, 0))
assertEquals(60L, hwFile2.read(topic, 0))
sendMessages(2)
// allow some time for the follower to get the leader HW
Thread.sleep(1000)
TestUtils.waitUntilTrue(() => server1.getReplica(topic, 0).get.highWatermark() == 120L, 1000)
// shutdown the servers to allow the hw to be checkpointed
servers.map(server => server.shutdown())
producer.close()
assert(hwFile1.length() > 0)
assert(hwFile2.length() > 0)
assertEquals(120L, readHW(hwFile1))
assertEquals(120L, readHW(hwFile2))
hwFile1.close()
hwFile2.close()
assertEquals(120L, hwFile1.read(topic, 0))
assertEquals(120L, hwFile2.read(topic, 0))
servers.map(server => Utils.rm(server.config.logDir))
}
@ -280,9 +245,4 @@ class LogRecoveryTest extends JUnit3Suite with ZooKeeperTestHarness {
producer.send(new ProducerData[Int, Message](topic, 0, sent1))
}
}
private def readHW(hwFile: RandomAccessFile): Long = {
hwFile.seek(0)
hwFile.readLong()
}
}
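For contrast with the removed readHW helper above, which seeked to the start of a per-partition binary file and read a single long, here is a usage sketch of the directory-level checkpoint (reusing the HwCheckpointSketch class sketched earlier; the paths and values are illustrative):

import java.nio.file.Files

// Usage sketch only; one checkpoint per log directory serves all partitions.
val dir = Files.createTempDirectory("kafka-logs-sketch").toString
val cp = new HwCheckpointSketch(dir)
cp.write(Map(("test-topic", 0) -> 60L, ("test-topic", 1) -> 30L))
assert(cp.read("test-topic", 0) == 60L)
assert(cp.read("unknown-topic", 0) == 0L) // absent entries default to 0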