Task Deduplication (#1062)

2026-01-03 20:01:49 +00:00
7 changed files with 903 additions and 581 deletions
--- a/dashboard/src/lib/components/TopologyGraph.svelte
+++ b/dashboard/src/lib/components/TopologyGraph.svelte
--- a/src/exo/shared/types/worker/runners.py
+++ b/src/exo/shared/types/worker/runners.py
@ -53,6 +53,10 @@ class RunnerRunning(BaseRunnerStatus):
    pass


+class RunnerShuttingDown(BaseRunnerStatus):
+    pass
+
+
 class RunnerShutdown(BaseRunnerStatus):
    pass

@ -70,6 +74,7 @@ RunnerStatus = (
    | RunnerWarmingUp
    | RunnerReady
    | RunnerRunning
+    | RunnerShuttingDown
    | RunnerShutdown
    | RunnerFailed
 )
--- a/src/exo/worker/plan.py
+++ b/src/exo/worker/plan.py
@ -274,6 +274,12 @@ def _pending_tasks(
            if task.instance_id != runner.bound_instance.instance.instance_id:
                continue

+            # Design note: this works around a state race - the task status is not updated to Complete quickly enough.
+            # Realistically the task status should only be set to Complete by the LAST runner to finish, so this is a genuine race.
+            # TODO: skipping tasks this runner has already completed is only a bypass; the proper fix goes somewhat deeper.
+            if task.task_id in runner.completed:
+                continue
+
            # TODO: Check ordering aligns with MLX distributeds expectations.

            if isinstance(runner.status, RunnerReady) and all(
--- a/src/exo/worker/runner/runner.py
+++ b/src/exo/worker/runner/runner.py
@ -32,6 +32,7 @@ from exo.shared.types.worker.runners import (
    RunnerReady,
    RunnerRunning,
    RunnerShutdown,
+    RunnerShuttingDown,
    RunnerStatus,
    RunnerWarmingUp,
 )
@ -187,13 +188,14 @@ def main(
                        current_status = RunnerReady()
                        logger.info("runner ready")
                    case Shutdown():
+                        current_status = RunnerShuttingDown()
                        logger.info("runner shutting down")
                        event_sender.send(
-                            TaskStatusUpdated(
-                                task_id=task.task_id, task_status=TaskStatus.Complete
+                            RunnerStatusUpdated(
+                                runner_id=runner_id, runner_status=current_status
                            )
                        )
-                        break
+                        current_status = RunnerShutdown()
                    case _:
                        raise ValueError(
                            f"Received {task.__class__.__name__} outside of state machine in {current_status=}"
@ -208,9 +210,8 @@ def main(
                        runner_id=runner_id, runner_status=current_status
                    )
                )
-        event_sender.send(
-            RunnerStatusUpdated(runner_id=runner_id, runner_status=RunnerShutdown())
-        )
+                if isinstance(current_status, RunnerShutdown):
+                    break
    except ClosedResourceError:
        logger.warning("runner communication closed unexpectedly")
    except Exception as e:
--- a/src/exo/worker/runner/runner_supervisor.py
+++ b/src/exo/worker/runner/runner_supervisor.py
@ -14,13 +14,23 @@ from anyio import (
 from anyio.abc import TaskGroup
 from loguru import logger

-from exo.shared.types.events import Event, RunnerStatusUpdated, TaskAcknowledged
-from exo.shared.types.tasks import Task, TaskId
+from exo.shared.types.events import (
+    Event,
+    RunnerStatusUpdated,
+    TaskAcknowledged,
+    TaskStatusUpdated,
+)
+from exo.shared.types.tasks import Task, TaskId, TaskStatus
 from exo.shared.types.worker.instances import BoundInstance
 from exo.shared.types.worker.runners import (
+    RunnerConnecting,
    RunnerFailed,
    RunnerIdle,
+    RunnerLoading,
+    RunnerRunning,
+    RunnerShuttingDown,
    RunnerStatus,
+    RunnerWarmingUp,
 )
 from exo.shared.types.worker.shards import ShardMetadata
 from exo.utils.channels import MpReceiver, MpSender, Sender, mp_channel
@ -39,10 +49,10 @@ class RunnerSupervisor:
    _ev_recv: MpReceiver[Event]
    _task_sender: MpSender[Task]
    _event_sender: Sender[Event]
-    # err_path: str
    _tg: TaskGroup | None = field(default=None, init=False)
    status: RunnerStatus = field(default_factory=RunnerIdle, init=False)
    pending: dict[TaskId, anyio.Event] = field(default_factory=dict, init=False)
+    completed: set[TaskId] = field(default_factory=set, init=False)

    @classmethod
    def create(
@ -77,7 +87,6 @@ class RunnerSupervisor:
            _ev_recv=ev_recv,
            _task_sender=task_sender,
            _event_sender=event_sender,
-            # err_path=err_path,
        )

        return self
@ -118,6 +127,10 @@ class RunnerSupervisor:
        self._tg.cancel_scope.cancel()

    async def start_task(self, task: Task):
+        if task.task_id in self.completed:
+            logger.info(
+                f"Skipping invalid task {task} as it has already been completed"
+            )
        logger.info(f"Starting task {task}")
        event = anyio.Event()
        self.pending[task.task_id] = event
@ -138,6 +151,22 @@ class RunnerSupervisor:
                    if isinstance(event, TaskAcknowledged):
                        self.pending.pop(event.task_id).set()
                        continue
+                    if (
+                        isinstance(event, TaskStatusUpdated)
+                        and event.task_status == TaskStatus.Complete
+                    ):
+                        # If a task has just been completed, we should be working on it.
+                        assert isinstance(
+                            self.status,
+                            (
+                                RunnerRunning,
+                                RunnerWarmingUp,
+                                RunnerLoading,
+                                RunnerConnecting,
+                                RunnerShuttingDown,
+                            ),
+                        )
+                        self.completed.add(event.task_id)
                    await self._event_sender.send(event)
            except (ClosedResourceError, BrokenResourceError) as e:
                await self._check_runner(e)
--- a/src/exo/worker/tests/unittests/conftest.py
+++ b/src/exo/worker/tests/unittests/conftest.py
@ -1,11 +1,9 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
+from dataclasses import dataclass, field

 from exo.shared.types.common import NodeId
 from exo.shared.types.memory import Memory
 from exo.shared.types.models import ModelId, ModelMetadata
-from exo.shared.types.tasks import BaseTask
+from exo.shared.types.tasks import BaseTask, TaskId
 from exo.shared.types.worker.instances import (
    BoundInstance,
    Instance,
@ -21,6 +19,7 @@ from exo.shared.types.worker.shards import PipelineShardMetadata, ShardMetadata
 class FakeRunnerSupervisor:
    bound_instance: BoundInstance
    status: RunnerStatus
+    completed: set[TaskId] = field(default_factory=set)


 class OtherTask(BaseTask):
--- a/src/exo/worker/tests/unittests/test_runner/test_event_ordering.py
+++ b/src/exo/worker/tests/unittests/test_runner/test_event_ordering.py
@ -34,6 +34,7 @@ from exo.shared.types.worker.runners import (
    RunnerReady,
    RunnerRunning,
    RunnerShutdown,
+    RunnerShuttingDown,
    RunnerWarmingUp,
 )
 from exo.utils.channels import mp_channel
@ -199,6 +200,9 @@ def test_events_processed_in_correct_order(patch_out_mlx: pytest.MonkeyPatch):
            RunnerStatusUpdated(runner_id=RUNNER_1_ID, runner_status=RunnerReady()),
            TaskStatusUpdated(task_id=SHUTDOWN_TASK_ID, task_status=TaskStatus.Running),
            TaskAcknowledged(task_id=SHUTDOWN_TASK_ID),
+            RunnerStatusUpdated(
+                runner_id=RUNNER_1_ID, runner_status=RunnerShuttingDown()
+            ),
            TaskStatusUpdated(
                task_id=SHUTDOWN_TASK_ID, task_status=TaskStatus.Complete
            ),