Switch to persistent BLE connection model

The connect/disconnect-every-cycle approach caused ~50% failure rate
over 48h — each of the ~2880 daily connection attempts per device had
a significant chance of failure through ESPHome proxies.

New model (same as the user's Android app):
- Connect once, keep the connection alive across poll cycles
- _ensure_connected() reconnects automatically if the link drops
- _on_disconnect() callback detects unexpected disconnections
- On timeout, force-disconnect so next cycle gets a clean reconnect
- Polls now only send commands (no connection overhead) — expected
  completion in <1s instead of 10-25s

Connection lifecycle:
  startup → first poll → _ensure_connected() → persistent
  drop detected → next poll → _ensure_connected() → reconnected
  shutdown → async_teardown() → disconnect()

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 17:46:20 +02:00
parent dcc528b96a
commit b8bee14839
2 changed files with 133 additions and 108 deletions
@@ -29,15 +29,72 @@ _TRAILER_LEN = 3
class BmsBluetoothHandler: class BmsBluetoothHandler:
"""Protocol framing and parsing for a Xiaoxiang BMS device. """Protocol framing and parsing for a Xiaoxiang BMS device.
Designed for a connect -> poll -> disconnect pattern: the BMS only allows Holds a **persistent** BLE connection. The connection is established on
one simultaneous BLE connection, so we hold it only for the duration of the first poll and kept alive across cycles. If it drops, the next poll
a single data fetch and release it immediately after. automatically reconnects. This avoids the massive overhead of
connect/disconnect every cycle through ESPHome BLE proxies.
""" """
def __init__(self, address: str) -> None: def __init__(self, address: str) -> None:
self._address = address self._address = address
self._buffer = bytearray() self._buffer = bytearray()
self._response_queue: asyncio.Queue[bytes] = asyncio.Queue() self._response_queue: asyncio.Queue[bytes] = asyncio.Queue()
self._client: BleakClientWithServiceCache | None = None
# ------------------------------------------------------------------
# Connection management
# ------------------------------------------------------------------
@property
def is_connected(self) -> bool:
return self._client is not None and self._client.is_connected
def _on_disconnect(self, _client) -> None:
"""Called by bleak when the connection drops unexpectedly."""
_LOGGER.debug("BMS %s disconnected", self._address)
self._client = None
async def _ensure_connected(
self,
ble_device: BLEDevice,
ble_device_callback: Callable[[], BLEDevice | None] | None = None,
) -> None:
"""Connect if not already connected."""
if self.is_connected:
return
# Clean up stale client
if self._client is not None:
try:
await self._client.disconnect()
except BleakError:
pass
self._client = None
self._reset()
_LOGGER.debug("Connecting to BMS %s", self._address)
client = await establish_connection(
BleakClientWithServiceCache,
ble_device,
self._address,
disconnected_callback=self._on_disconnect,
max_attempts=2,
ble_device_callback=ble_device_callback,
)
await client.start_notify(RX_CHAR_UUID, self._on_notify)
await asyncio.sleep(0.3)
self._client = client
_LOGGER.info("BMS %s connected", self._address)
async def disconnect(self) -> None:
"""Explicitly close the connection (used during teardown)."""
if self._client is not None:
try:
await self._client.disconnect()
except BleakError:
pass
self._client = None
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Internal helpers # Internal helpers
@@ -52,12 +109,12 @@ class BmsBluetoothHandler:
break break
def _reset(self) -> None: def _reset(self) -> None:
"""Clear all transient state before a new connection.""" """Clear all transient state."""
self._buffer.clear() self._buffer.clear()
self._drain_queue() self._drain_queue()
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# High-level poll — the only entry point the coordinator needs # High-level poll
# ------------------------------------------------------------------ # ------------------------------------------------------------------
async def poll( async def poll(
@@ -68,46 +125,26 @@ class BmsBluetoothHandler:
retries: int = 2, retries: int = 2,
ble_device_callback: Callable[[], BLEDevice | None] | None = None, ble_device_callback: Callable[[], BLEDevice | None] | None = None,
) -> list[bytes | None]: ) -> list[bytes | None]:
"""Connect, send each command in sequence, disconnect. """Send each command over the persistent connection, return responses.
The BMS only supports a single BLE connection at a time. By connecting Connects automatically if not already connected. If the connection
only during the active read window and disconnecting immediately after, drops mid-poll, raises BleakError so the coordinator can handle it.
the mobile app (or any other client) can connect freely between polls.
""" """
self._reset() self._drain_queue()
_LOGGER.debug("Polling BMS at %s", self._address)
client = await establish_connection( await self._ensure_connected(ble_device, ble_device_callback)
BleakClientWithServiceCache,
ble_device, return [
self._address, await self._request(self._client, cmd, timeout, retries)
max_attempts=2, for cmd in commands
ble_device_callback=ble_device_callback, ]
)
try:
await client.start_notify(RX_CHAR_UUID, self._on_notify)
await asyncio.sleep(0.3)
return [
await self._request(client, cmd, timeout, retries)
for cmd in commands
]
finally:
try:
await client.disconnect()
except BleakError:
pass
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Frame reception # Frame reception
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def _on_notify(self, _char, data: bytearray) -> None: def _on_notify(self, _char, data: bytearray) -> None:
"""Accumulate BLE notification chunks into complete protocol frames. """Accumulate BLE notification chunks into complete protocol frames."""
BLE max payload is 20 bytes (default MTU), so a single BMS frame
(up to ~50 bytes for 16 cells) arrives across several notifications.
We buffer until we can calculate and verify the expected frame length.
"""
self._buffer.extend(data) self._buffer.extend(data)
# Discard leading garbage until we see a frame start byte # Discard leading garbage until we see a frame start byte
@@ -140,7 +177,7 @@ class BmsBluetoothHandler:
self._response_queue.put_nowait(frame) self._response_queue.put_nowait(frame)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Request / response (private — used inside poll()) # Request / response
# ------------------------------------------------------------------ # ------------------------------------------------------------------
async def _request( async def _request(
@@ -150,19 +187,22 @@ class BmsBluetoothHandler:
timeout: float, timeout: float,
retries: int, retries: int,
) -> bytes | None: ) -> bytes | None:
"""Send one command and wait for the response frame, with retries. """Send one command and wait for the response frame, with retries."""
Tries Write With Response first; falls back to Write Without Response
if the characteristic rejects it — covers both BMS firmware variants.
"""
for attempt in range(1, retries + 1): for attempt in range(1, retries + 1):
# Drain any stale frames before sending a new command if not client.is_connected:
_LOGGER.warning("BMS %s connection lost during request", self._address)
self._client = None
return None
self._drain_queue() self._drain_queue()
self._buffer.clear() self._buffer.clear()
try: try:
await client.write_gatt_char(TX_CHAR_UUID, command, response=True) await client.write_gatt_char(TX_CHAR_UUID, command, response=True)
except BleakError: except BleakError:
if not client.is_connected:
self._client = None
return None
try: try:
await client.write_gatt_char(TX_CHAR_UUID, command, response=False) await client.write_gatt_char(TX_CHAR_UUID, command, response=False)
except BleakError as exc: except BleakError as exc:
@@ -195,32 +235,15 @@ class BmsBluetoothHandler:
value: int, value: int,
ble_device_callback: Callable[[], BLEDevice | None] | None = None, ble_device_callback: Callable[[], BLEDevice | None] | None = None,
) -> bool: ) -> bool:
"""Send a MOS control write command and return True on ACK. """Send a MOS control write command over the persistent connection."""
self._drain_queue()
Follows the same connect -> send -> disconnect pattern as poll() so
it doesn't interfere with the normal poll cycle.
"""
self._reset()
command = self._build_mos_command(value) command = self._build_mos_command(value)
_LOGGER.debug("Writing MOS value 0x%02X to BMS at %s", value, self._address) _LOGGER.debug("Writing MOS value 0x%02X to BMS at %s", value, self._address)
client = await establish_connection( await self._ensure_connected(ble_device, ble_device_callback)
BleakClientWithServiceCache,
ble_device, response = await self._request(self._client, command, timeout=3.0, retries=2)
self._address, return response is not None and response[2] == 0x00
max_attempts=2,
ble_device_callback=ble_device_callback,
)
try:
await client.start_notify(RX_CHAR_UUID, self._on_notify)
await asyncio.sleep(0.3)
response = await self._request(client, command, timeout=3.0, retries=2)
return response is not None and response[2] == 0x00
finally:
try:
await client.disconnect()
except BleakError:
pass
@staticmethod @staticmethod
def _build_mos_command(value: int) -> bytes: def _build_mos_command(value: int) -> bytes:
@@ -230,9 +253,6 @@ class BmsBluetoothHandler:
Checked bytes (per spec): command_code + length + data bytes Checked bytes (per spec): command_code + length + data bytes
= 0xE1 + 0x02 + 0x00 + XX = 0xE1 + 0x02 + 0x00 + XX
Checksum = two's complement of sum, high byte first. Checksum = two's complement of sum, high byte first.
Verified against spec example:
XX=0x02 -> sum=0xE5 -> ~0xE5+1=0xFF1B -> CHK FF 1B
""" """
checked = [0xE1, 0x02, 0x00, value & 0xFF] checked = [0xE1, 0x02, 0x00, value & 0xFF]
checksum = (~sum(checked) + 1) & 0xFFFF checksum = (~sum(checked) + 1) & 0xFFFF
@@ -252,7 +272,7 @@ class BmsBluetoothHandler:
Payload byte offsets (frame[4] is payload[0]): Payload byte offsets (frame[4] is payload[0]):
0-1 Total voltage uint16 BE /100 -> V 0-1 Total voltage uint16 BE /100 -> V
2-3 Current int16 BE /100 -> A (positive = charging, negative = discharging) 2-3 Current int16 BE /100 -> A (positive = charging)
4-5 Residual capacity uint16 BE /100 -> Ah 4-5 Residual capacity uint16 BE /100 -> Ah
6-7 Nominal capacity uint16 BE /100 -> Ah 6-7 Nominal capacity uint16 BE /100 -> Ah
8-9 Cycle count uint16 BE 8-9 Cycle count uint16 BE
@@ -287,12 +307,9 @@ class BmsBluetoothHandler:
"state_of_charge": p[19], "state_of_charge": p[19],
"cell_count": p[21], "cell_count": p[21],
"temperatures": temperatures, "temperatures": temperatures,
# MOS status
"mos_charge_enabled": bool(mos & 0x01), "mos_charge_enabled": bool(mos & 0x01),
"mos_discharge_enabled": bool(mos & 0x02), "mos_discharge_enabled": bool(mos & 0x02),
# Cell balancing (any cell currently balancing)
"balance_active": balance != 0, "balance_active": balance != 0,
# Protection flags (bit per event, True = protection triggered)
"prot_cell_overvolt": bool(prot & (1 << 0)), "prot_cell_overvolt": bool(prot & (1 << 0)),
"prot_cell_undervolt": bool(prot & (1 << 1)), "prot_cell_undervolt": bool(prot & (1 << 1)),
"prot_pack_overvolt": bool(prot & (1 << 2)), "prot_pack_overvolt": bool(prot & (1 << 2)),
@@ -319,10 +336,8 @@ class BmsBluetoothHandler:
"""Parse a 0x04 cell voltage response frame. """Parse a 0x04 cell voltage response frame.
Per spec: frame[3] (the header length byte) = cell_count x 2. Per spec: frame[3] (the header length byte) = cell_count x 2.
The payload contains ONLY the voltage bytes — no count byte.
0+ Cell voltages uint16 BE each unit mV /1000 -> V
""" """
count = frame[3] // 2 # header length byte = N_cells x 2 count = frame[3] // 2
p = frame[_HEADER_LEN:-_TRAILER_LEN] p = frame[_HEADER_LEN:-_TRAILER_LEN]
voltages: list[float] = [] voltages: list[float] = []
for i in range(count): for i in range(count):
+42 -32
View File
@@ -5,6 +5,7 @@ import asyncio
import logging import logging
from datetime import timedelta from datetime import timedelta
from bleak import BleakError
from bleak.backends.device import BLEDevice from bleak.backends.device import BLEDevice
from homeassistant.components.bluetooth import async_ble_device_from_address from homeassistant.components.bluetooth import async_ble_device_from_address
from homeassistant.core import HomeAssistant from homeassistant.core import HomeAssistant
@@ -18,21 +19,19 @@ from .const import CMD_CELL, CMD_GENERAL, CMD_VERSION, DOMAIN
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
# Only mark sensors unavailable after this many *consecutive* failed polls. # Only mark sensors unavailable after this many *consecutive* failed polls.
# Transient BLE misses (device not in cache, ESPHome proxy busy, etc.) return
# the last known data instead so the UI doesn't oscillate.
_FAILURES_BEFORE_UNAVAILABLE = 5 _FAILURES_BEFORE_UNAVAILABLE = 5
# Hard ceiling on the BLE poll operation (connect + commands + disconnect). # Hard ceiling on a single poll (commands only — no connection overhead
# With the global lock preventing contention, connections should be fast — # when already connected). Reconnection adds ~5 s on top.
# 15 s is generous for 2 commands over a local proxy.
_POLL_TIMEOUT = 15 _POLL_TIMEOUT = 15
class BmsCoordinator(DataUpdateCoordinator[dict]): class BmsCoordinator(DataUpdateCoordinator[dict]):
"""Polls the BMS over BLE and distributes data to all sensor entities. """Polls the BMS over BLE and distributes data to all sensor entities.
Uses a connect -> read -> disconnect pattern on every poll so the BMS's Holds a **persistent** BLE connection per device. The connection is
single BLE connection slot is free between updates (mobile app access). established on the first poll and kept alive across cycles. If it
drops, the next poll automatically reconnects.
""" """
def __init__( def __init__(
@@ -56,12 +55,12 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
self._consecutive_failures = 0 self._consecutive_failures = 0
# Kept fresh by the BLE advertisement callback registered in __init__.py # Kept fresh by the BLE advertisement callback registered in __init__.py
self._ble_device: BLEDevice | None = None self._ble_device: BLEDevice | None = None
# Shared across all BMS coordinator instances so only one BMS connects # Shared across all BMS coordinator instances — serialises BLE
# at a time — prevents ESPHome proxy connection slot exhaustion. # operations so multiple devices don't fight for proxy slots.
self._ble_lock = ble_lock or asyncio.Lock() self._ble_lock = ble_lock or asyncio.Lock()
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Device info — shared by sensor, binary_sensor, number platforms # Device info
# ------------------------------------------------------------------ # ------------------------------------------------------------------
@property @property
@@ -78,10 +77,11 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
# ------------------------------------------------------------------ # ------------------------------------------------------------------
async def async_setup(self) -> None: async def async_setup(self) -> None:
"""No-op — no persistent connection to establish.""" """No-op — connection is established lazily on first poll."""
async def async_teardown(self) -> None: async def async_teardown(self) -> None:
"""No-op — each poll disconnects itself.""" """Disconnect the persistent BLE connection."""
await self._handler.disconnect()
async def async_write_mos(self, value: int) -> None: async def async_write_mos(self, value: int) -> None:
"""Send a MOS control command to the BMS, then refresh sensor state.""" """Send a MOS control command to the BMS, then refresh sensor state."""
@@ -91,10 +91,13 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
f"BMS ({self.address}) not reachable — cannot send MOS command" f"BMS ({self.address}) not reachable — cannot send MOS command"
) )
async with self._ble_lock: async with self._ble_lock:
success = await self._handler.write_mos( try:
device, value, success = await self._handler.write_mos(
ble_device_callback=self._get_ble_device, device, value,
) ble_device_callback=self._get_ble_device,
)
except (BleakError, asyncio.TimeoutError) as exc:
raise HomeAssistantError(f"MOS command failed: {exc}") from exc
if not success: if not success:
raise HomeAssistantError("BMS did not acknowledge the MOS command") raise HomeAssistantError("BMS did not acknowledge the MOS command")
await self.async_request_refresh() await self.async_request_refresh()
@@ -104,25 +107,22 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def _get_ble_device(self) -> BLEDevice | None: def _get_ble_device(self) -> BLEDevice | None:
"""Return the freshest available BLEDevice reference. """Return the freshest available BLEDevice reference."""
if self._ble_device is not None:
Prefers the advertisement-callback reference (_ble_device) because it return self._ble_device
tracks proxy transport path changes. Falls back to the scanner cache. device = async_ble_device_from_address(
"""
return self._ble_device or async_ble_device_from_address(
self.hass, self.address, connectable=True self.hass, self.address, connectable=True
) )
if device is not None:
_LOGGER.debug("BMS %s found via scanner cache", self.address)
return device
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Poll # Poll
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def _handle_failure(self, reason: str) -> dict: def _handle_failure(self, reason: str) -> dict:
"""On a transient failure, return cached data up to the threshold. """Return cached data up to the threshold, then go unavailable."""
Only raises UpdateFailed (-> sensors go unavailable) after
_FAILURES_BEFORE_UNAVAILABLE consecutive misses.
"""
self._consecutive_failures += 1 self._consecutive_failures += 1
if self._consecutive_failures <= _FAILURES_BEFORE_UNAVAILABLE and self.data: if self._consecutive_failures <= _FAILURES_BEFORE_UNAVAILABLE and self.data:
_LOGGER.debug( _LOGGER.debug(
@@ -135,9 +135,17 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
raise UpdateFailed(reason) raise UpdateFailed(reason)
async def _async_update_data(self) -> dict: async def _async_update_data(self) -> dict:
"""Connect to the BMS, fetch all data, disconnect.""" """Poll the BMS over the persistent connection."""
device = self._get_ble_device() device = self._get_ble_device()
if device is None:
# Wait up to 5 s for an advertisement (after HA restart / proxy reconnect)
_LOGGER.debug("BMS %s not discovered yet, waiting…", self.address)
for _ in range(5):
await asyncio.sleep(1.0)
device = self._get_ble_device()
if device is not None:
break
if device is None: if device is None:
return self._handle_failure( return self._handle_failure(
f"BMS ({self.address}) not reachable — check Bluetooth adapter / proxy" f"BMS ({self.address}) not reachable — check Bluetooth adapter / proxy"
@@ -147,8 +155,7 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
if self.hw_version is None: if self.hw_version is None:
commands.append(CMD_VERSION) commands.append(CMD_VERSION)
# Only one BMS polls at a time — prevents proxy connection slot contention. # Serialise BLE operations across all BMS instances.
# The timeout wraps only the actual BLE operation, not the lock wait.
async with self._ble_lock: async with self._ble_lock:
try: try:
responses = await asyncio.wait_for( responses = await asyncio.wait_for(
@@ -160,10 +167,12 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
timeout=_POLL_TIMEOUT, timeout=_POLL_TIMEOUT,
) )
except asyncio.TimeoutError: except asyncio.TimeoutError:
# Connection might be dead — force reconnect next cycle
await self._handler.disconnect()
return self._handle_failure( return self._handle_failure(
f"BMS poll timed out after {_POLL_TIMEOUT}s" f"BMS poll timed out after {_POLL_TIMEOUT}s"
) )
except Exception as exc: except (BleakError, Exception) as exc:
return self._handle_failure(f"BMS poll failed: {exc}") return self._handle_failure(f"BMS poll failed: {exc}")
general_frame, cell_frame = responses[0], responses[1] general_frame, cell_frame = responses[0], responses[1]
@@ -194,9 +203,10 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
data["cell_delta"] = None data["cell_delta"] = None
_LOGGER.debug( _LOGGER.debug(
"BMS data: %.2fV %.2fA %d%% %.2fAh %.3fkWh %d cells", "BMS data: %.2fV %.2fA %d%% %.2fAh %.3fkWh %d cells (connected: %s)",
data["voltage"], data["current"], data["state_of_charge"], data["voltage"], data["current"], data["state_of_charge"],
data["residual_capacity"], data["energy_stored"], data["residual_capacity"], data["energy_stored"],
len(data["cell_voltages"]), len(data["cell_voltages"]),
self._handler.is_connected,
) )
return data return data