Add global BLE lock + faster timeouts for multi-device reliability

Root cause: 3 BMS devices fighting for 3 ESPHome proxy connection slots
simultaneously, causing 80% timeout failures and 22s+ poll times.

Fixes:
- Add a shared asyncio.Lock so only one BMS connects at a time (polls
  and MOS writes) — eliminates proxy slot contention between our devices
- Pass ble_device_callback to establish_connection so retry attempts
  get a fresh BLEDevice (handles proxy path changes)
- Reduce command timeout 5s -> 3s, retries 3 -> 2, and retry back-off
  0.5s -> 0.3s (BMS responds in <200ms when connection is clean)
- Reduce establish_connection max_attempts 3 -> 2 (fail fast, retry
  next cycle instead of blocking 25s)
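- Use a fixed 15s poll timeout (was max(poll_interval - 5, 20), i.e. 25s
  with a 30s poll interval); it covers only the BLE operation, not the
  wait for the lock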

Expected: polls complete in 2-5s instead of 22s, ~95%+ success rate.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-12 10:05:54 +02:00
parent 1520ed3c0f
commit dcc528b96a
3 changed files with 65 additions and 23 deletions
+16 -2
View File
@@ -1,6 +1,8 @@
"""Xiaoxiang Smart BMS — Home Assistant integration.""" """Xiaoxiang Smart BMS — Home Assistant integration."""
from __future__ import annotations from __future__ import annotations
import asyncio
from homeassistant.components.bluetooth import ( from homeassistant.components.bluetooth import (
BluetoothChange, BluetoothChange,
BluetoothScanningMode, BluetoothScanningMode,
@@ -22,7 +24,19 @@ async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry) -> bool:
address = entry.data[CONF_ADDRESS] address = entry.data[CONF_ADDRESS]
poll_interval = entry.options.get(CONF_POLL_INTERVAL, DEFAULT_POLL_INTERVAL) poll_interval = entry.options.get(CONF_POLL_INTERVAL, DEFAULT_POLL_INTERVAL)
coordinator = BmsCoordinator(hass, address, poll_interval, name=entry.title) hass.data.setdefault(DOMAIN, {})
# Shared BLE lock — only one BMS connects at a time to avoid
# ESPHome proxy connection slot exhaustion with multiple devices.
if "_ble_lock" not in hass.data[DOMAIN]:
hass.data[DOMAIN]["_ble_lock"] = asyncio.Lock()
ble_lock = hass.data[DOMAIN]["_ble_lock"]
coordinator = BmsCoordinator(
hass, address, poll_interval,
name=entry.title,
ble_lock=ble_lock,
)
# Keep the coordinator's BLE device reference fresh via advertisement callback. # Keep the coordinator's BLE device reference fresh via advertisement callback.
# This avoids stale transport paths when ESPHome proxies cycle. # This avoids stale transport paths when ESPHome proxies cycle.
@@ -45,7 +59,7 @@ async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry) -> bool:
await coordinator.async_setup() await coordinator.async_setup()
await coordinator.async_config_entry_first_refresh() await coordinator.async_config_entry_first_refresh()
hass.data.setdefault(DOMAIN, {})[entry.entry_id] = coordinator hass.data[DOMAIN][entry.entry_id] = coordinator
await hass.config_entries.async_forward_entry_setups(entry, PLATFORMS) await hass.config_entries.async_forward_entry_setups(entry, PLATFORMS)
entry.async_on_unload(entry.add_update_listener(_async_update_listener)) entry.async_on_unload(entry.add_update_listener(_async_update_listener))
@@ -4,6 +4,7 @@ from __future__ import annotations
import asyncio import asyncio
import logging import logging
import struct import struct
from collections.abc import Callable
from bleak import BleakError from bleak import BleakError
from bleak.backends.device import BLEDevice from bleak.backends.device import BLEDevice
@@ -63,8 +64,9 @@ class BmsBluetoothHandler:
self, self,
ble_device: BLEDevice, ble_device: BLEDevice,
commands: list[bytes], commands: list[bytes],
timeout: float = 5.0, timeout: float = 3.0,
retries: int = 3, retries: int = 2,
ble_device_callback: Callable[[], BLEDevice | None] | None = None,
) -> list[bytes | None]: ) -> list[bytes | None]:
"""Connect, send each command in sequence, disconnect. """Connect, send each command in sequence, disconnect.
@@ -79,7 +81,8 @@ class BmsBluetoothHandler:
BleakClientWithServiceCache, BleakClientWithServiceCache,
ble_device, ble_device,
self._address, self._address,
max_attempts=3, max_attempts=2,
ble_device_callback=ble_device_callback,
) )
try: try:
await client.start_notify(RX_CHAR_UUID, self._on_notify) await client.start_notify(RX_CHAR_UUID, self._on_notify)
@@ -166,7 +169,7 @@ class BmsBluetoothHandler:
_LOGGER.warning("BLE write failed (attempt %d/%d): %s", _LOGGER.warning("BLE write failed (attempt %d/%d): %s",
attempt, retries, exc) attempt, retries, exc)
if attempt < retries: if attempt < retries:
await asyncio.sleep(0.5) await asyncio.sleep(0.3)
continue continue
try: try:
@@ -178,7 +181,7 @@ class BmsBluetoothHandler:
_LOGGER.warning("BMS timeout (cmd=0x%s, attempt %d/%d)", _LOGGER.warning("BMS timeout (cmd=0x%s, attempt %d/%d)",
command.hex(), attempt, retries) command.hex(), attempt, retries)
if attempt < retries: if attempt < retries:
await asyncio.sleep(0.5) await asyncio.sleep(0.3)
return None return None
@@ -186,7 +189,12 @@ class BmsBluetoothHandler:
# MOS write command # MOS write command
# ------------------------------------------------------------------ # ------------------------------------------------------------------
async def write_mos(self, ble_device: BLEDevice, value: int) -> bool: async def write_mos(
self,
ble_device: BLEDevice,
value: int,
ble_device_callback: Callable[[], BLEDevice | None] | None = None,
) -> bool:
"""Send a MOS control write command and return True on ACK. """Send a MOS control write command and return True on ACK.
Follows the same connect -> send -> disconnect pattern as poll() so Follows the same connect -> send -> disconnect pattern as poll() so
@@ -200,12 +208,13 @@ class BmsBluetoothHandler:
BleakClientWithServiceCache, BleakClientWithServiceCache,
ble_device, ble_device,
self._address, self._address,
max_attempts=3, max_attempts=2,
ble_device_callback=ble_device_callback,
) )
try: try:
await client.start_notify(RX_CHAR_UUID, self._on_notify) await client.start_notify(RX_CHAR_UUID, self._on_notify)
await asyncio.sleep(0.3) await asyncio.sleep(0.3)
response = await self._request(client, command, timeout=5.0, retries=2) response = await self._request(client, command, timeout=3.0, retries=2)
return response is not None and response[2] == 0x00 return response is not None and response[2] == 0x00
finally: finally:
try: try:
+24 -5
View File
@@ -22,6 +22,11 @@ _LOGGER = logging.getLogger(__name__)
# the last known data instead so the UI doesn't oscillate. # the last known data instead so the UI doesn't oscillate.
_FAILURES_BEFORE_UNAVAILABLE = 5 _FAILURES_BEFORE_UNAVAILABLE = 5
# Hard ceiling on the BLE poll operation (connect + commands + disconnect).
# With the global lock preventing contention, connections should be fast —
# 15 s is generous for 2 commands over a local proxy.
_POLL_TIMEOUT = 15
class BmsCoordinator(DataUpdateCoordinator[dict]): class BmsCoordinator(DataUpdateCoordinator[dict]):
"""Polls the BMS over BLE and distributes data to all sensor entities. """Polls the BMS over BLE and distributes data to all sensor entities.
@@ -36,6 +41,7 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
address: str, address: str,
poll_interval: int, poll_interval: int,
name: str = "Xiaoxiang Smart BMS", name: str = "Xiaoxiang Smart BMS",
ble_lock: asyncio.Lock | None = None,
) -> None: ) -> None:
super().__init__( super().__init__(
hass, hass,
@@ -45,12 +51,14 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
) )
self.address = address self.address = address
self._device_name = name self._device_name = name
self._poll_timeout = max(poll_interval - 5, 20)
self._handler = BmsBluetoothHandler(address) self._handler = BmsBluetoothHandler(address)
self.hw_version: str | None = None self.hw_version: str | None = None
self._consecutive_failures = 0 self._consecutive_failures = 0
# Kept fresh by the BLE advertisement callback registered in __init__.py # Kept fresh by the BLE advertisement callback registered in __init__.py
self._ble_device: BLEDevice | None = None self._ble_device: BLEDevice | None = None
# Shared across all BMS coordinator instances so only one BMS connects
# at a time — prevents ESPHome proxy connection slot exhaustion.
self._ble_lock = ble_lock or asyncio.Lock()
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Device info — shared by sensor, binary_sensor, number platforms # Device info — shared by sensor, binary_sensor, number platforms
@@ -82,7 +90,11 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
raise HomeAssistantError( raise HomeAssistantError(
f"BMS ({self.address}) not reachable — cannot send MOS command" f"BMS ({self.address}) not reachable — cannot send MOS command"
) )
success = await self._handler.write_mos(device, value) async with self._ble_lock:
success = await self._handler.write_mos(
device, value,
ble_device_callback=self._get_ble_device,
)
if not success: if not success:
raise HomeAssistantError("BMS did not acknowledge the MOS command") raise HomeAssistantError("BMS did not acknowledge the MOS command")
await self.async_request_refresh() await self.async_request_refresh()
@@ -135,14 +147,21 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
if self.hw_version is None: if self.hw_version is None:
commands.append(CMD_VERSION) commands.append(CMD_VERSION)
# Only one BMS polls at a time — prevents proxy connection slot contention.
# The timeout wraps only the actual BLE operation, not the lock wait.
async with self._ble_lock:
try: try:
responses = await asyncio.wait_for( responses = await asyncio.wait_for(
self._handler.poll(device, commands), self._handler.poll(
timeout=self._poll_timeout, device,
commands,
ble_device_callback=self._get_ble_device,
),
timeout=_POLL_TIMEOUT,
) )
except asyncio.TimeoutError: except asyncio.TimeoutError:
return self._handle_failure( return self._handle_failure(
f"BMS poll timed out after {self._poll_timeout}s" f"BMS poll timed out after {_POLL_TIMEOUT}s"
) )
except Exception as exc: except Exception as exc:
return self._handle_failure(f"BMS poll failed: {exc}") return self._handle_failure(f"BMS poll failed: {exc}")