Fix #8: scheduler survives link death; timed-out one-offs cancelled

- A transport exception in the poll loop killed the thread silently, leaving the
  GUI on a frozen 'Connected' dashboard and blocking run_oneoff callers for the
  full timeout. _loop now catches it -> stops, fails pending one-offs with the
  real error, and calls an on_error callback. Controller wires on_error to flag
  the connection dead; the GUI detects it in _tick and tears down with a
  'Connection lost' dialog.
- A run_oneoff that timed out left its job queued, so it executed LATER on the
  shared link -- a ghost/duplicate vehicle command. Jobs now carry
  cancelled/started flags under a lock; on timeout a not-yet-started job is
  cancelled (skipped by _drain_oneoffs), and a started one reports 'still
  running -- do NOT retry'. stop() also frees stranded submitters.
- tests/test_scheduler.py: cancel-on-timeout, freed-on-death, loop-survives.

Closes #8

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_016yT89n4zR4qbrySoSiEyZs
This commit is contained in:
2026-07-01 19:33:33 -04:00
parent b5e0c96763
commit 23c92018c1
4 changed files with 154 additions and 13 deletions
+59 -12
View File
@@ -16,14 +16,19 @@ import time
class _OneOff:
    """A single command to run once on the polling thread (DTC read/clear,
    probe, etc). The submitter blocks on `done` until the thread has run it.
    `cancelled`/`started` (guarded by `lock`) let a timed-out submitter cancel a
    still-queued job so it never fires late on the vehicle."""

    __slots__ = ("fn", "done", "result", "error", "cancelled", "started", "lock")

    def __init__(self, fn):
        self.fn = fn                    # zero-argument callable, invoked as fn()
        self.done = threading.Event()   # set once the job has run or been failed
        self.result = None              # return value of fn(), when it succeeded
        self.error = None               # exception to re-raise in the submitter
        self.cancelled = False          # True -> the job must be skipped, never run
        self.started = False            # True -> fn() has been (or is being) invoked
        self.lock = threading.Lock()    # guards the cancelled/started hand-off
class _Sub:
@@ -39,13 +44,14 @@ class _Sub:
class PollScheduler:
def __init__(self, link, registry, store, clock=time.time, dead_after=4,
revive_every=5.0):
revive_every=5.0, on_error=None):
self.link = link
self.reg = registry
self.store = store
self.clock = clock
self.dead_after = dead_after
self.revive_every = revive_every
self.on_error = on_error # called(exc) if the poll thread dies
self._subs = {}
self._lock = threading.Lock()
self._thread = None
@@ -102,6 +108,10 @@ class PollScheduler:
job = self._oneoffs.get_nowait()
except queue.Empty:
return
with job.lock: # a timed-out submitter may have cancelled
if job.cancelled:
continue
job.started = True
try:
job.result = job.fn()
except Exception as e: # hand the failure back
@@ -109,17 +119,40 @@ class PollScheduler:
finally:
job.done.set()
def _fail_pending_oneoffs(self, exc):
    """Fail every still-queued (not yet started) one-off with `exc` so a
    blocked submitter is freed immediately instead of hanging the full
    timeout -- used when the poll thread dies or stops.

    `exc` is stored as each job's `error`; anything that is not already an
    exception instance is wrapped in RuntimeError(str(exc)). Jobs already
    flagged `started` or `cancelled` are dropped from the queue untouched.
    Returns None once the queue is empty.
    """
    err = exc if isinstance(exc, BaseException) else RuntimeError(str(exc))
    while True:
        try:
            job = self._oneoffs.get_nowait()
        except queue.Empty:
            return
        # Decide under the job's lock so this cannot interleave with a
        # submitter that is cancelling the same job after its own timeout.
        with job.lock:
            if job.started or job.cancelled:
                continue
            job.cancelled = True
            job.error = err
        # Wake the submitter only after the flags and error are in place.
        job.done.set()
def run_oneoff(self, fn, timeout=8.0):
    """Enqueue `fn` to run once on the polling thread and block for its
    result (or re-raise its exception). When no live polling thread is
    servicing the queue, the job is drained inline on the caller -- still
    serialized against tick(), and safe because nothing else touches the
    link. On timeout a still-queued job is CANCELLED so it can never fire
    late on the vehicle.

    Raises TimeoutError when the job did not complete within `timeout`
    seconds: either it was cancelled before starting, or it has started and
    is still running (in which case the caller must not retry).
    """
    job = _OneOff(fn)
    self._oneoffs.put(job)
    # Snapshot the thread: stop() may set self._thread to None concurrently,
    # and reading the attribute twice could then call .is_alive() on None.
    thread = self._thread
    if not self._running or (thread is not None and not thread.is_alive()):
        self._drain_oneoffs()
    if not job.done.wait(timeout):
        with job.lock:
            if not job.started:
                job.cancelled = True
                raise TimeoutError("command timed out and was cancelled — it will not run")
        # The job was started. It may have finished in the window between
        # wait() expiring and the lock being taken; only report it as
        # running if it really has not completed yet.
        if not job.done.is_set():
            raise TimeoutError("command is still running on the adapter — do NOT retry")
    if job.error is not None:
        raise job.error
    return job.result
@@ -167,16 +200,30 @@ class PollScheduler:
self._thread.start()
def _loop(self):
    """Body of the polling thread: call tick() until `_running` is cleared.

    If tick() raises, the scheduler marks itself stopped, fails every
    still-queued one-off with that exception and reports it through the
    `on_error` callback (when one is set). The exception is not re-raised.
    """
    try:
        while self._running:
            if self.tick() == 0:
                time.sleep(0.005)  # nothing due; yield
    except Exception as e:
        # a transport/link error would otherwise kill the thread silently,
        # leaving the GUI showing "Connected" with frozen data and blocked
        # one-off callers hung. Fail loudly instead.
        self._running = False
        self._fail_pending_oneoffs(e)
        cb = self.on_error
        if cb:
            try:
                cb(e)
            except Exception:
                # The callback is best-effort and must not crash the dying
                # thread, but a failure in it should still leave a trace.
                import logging
                logging.getLogger(__name__).exception(
                    "on_error callback raised while reporting poll-thread failure")
def stop(self):
    """Stop polling: clear `_running`, wait up to 2 s for the poll thread to
    exit, then fail any one-offs still queued so their submitters unblock.

    Safe to call from the poll thread itself (e.g. from an error callback):
    a thread cannot join itself -- Thread.join() raises RuntimeError -- so
    the join is skipped in that case and the rest of the teardown still runs.
    """
    self._running = False
    thread = self._thread
    if thread is not None:
        if thread is not threading.current_thread():
            thread.join(timeout=2.0)
        self._thread = None
    self._fail_pending_oneoffs(RuntimeError("scheduler stopped"))
def _is_derived(reg, key):