Refactor BLE layer for 24/7 reliability

- Replace raw BleakClient with establish_connection from
  bleak-retry-connector (retries, GATT service cache, proxy-aware)
- Replace fragile asyncio.Event with asyncio.Queue for response frames,
  drain stale data on each connection to prevent cross-cycle leakage
- Register BLE advertisement callback to keep BLEDevice reference fresh
  across ESPHome proxy path changes
- Remove asyncio.sleep(2) device lookup hack
- Increase poll timeout floor from 10s to 20s
- Increase failure tolerance from 3 to 5 consecutive misses
- Bump default poll interval to 30s, min to 15s (halves connection churn)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-12 09:36:08 +02:00
parent c00d9b66c2
commit 1520ed3c0f
4 changed files with 131 additions and 72 deletions
+27 -2
View File
@@ -1,8 +1,15 @@
"""Xiaoxiang Smart BMS — Home Assistant integration.""" """Xiaoxiang Smart BMS — Home Assistant integration."""
from __future__ import annotations from __future__ import annotations
from homeassistant.components.bluetooth import (
BluetoothChange,
BluetoothScanningMode,
BluetoothServiceInfoBleak,
async_register_callback,
)
from homeassistant.components.bluetooth.match import ADDRESS, BluetoothCallbackMatcher
from homeassistant.config_entries import ConfigEntry from homeassistant.config_entries import ConfigEntry
from homeassistant.core import HomeAssistant from homeassistant.core import HomeAssistant, callback
from .const import CONF_ADDRESS, CONF_POLL_INTERVAL, DEFAULT_POLL_INTERVAL, DOMAIN from .const import CONF_ADDRESS, CONF_POLL_INTERVAL, DEFAULT_POLL_INTERVAL, DOMAIN
from .coordinator import BmsCoordinator from .coordinator import BmsCoordinator
@@ -16,13 +23,31 @@ async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry) -> bool:
poll_interval = entry.options.get(CONF_POLL_INTERVAL, DEFAULT_POLL_INTERVAL) poll_interval = entry.options.get(CONF_POLL_INTERVAL, DEFAULT_POLL_INTERVAL)
coordinator = BmsCoordinator(hass, address, poll_interval, name=entry.title) coordinator = BmsCoordinator(hass, address, poll_interval, name=entry.title)
# Keep the coordinator's BLE device reference fresh via advertisement callback.
# This avoids stale transport paths when ESPHome proxies cycle.
@callback
def _async_update_ble(
service_info: BluetoothServiceInfoBleak,
change: BluetoothChange,
) -> None:
coordinator._ble_device = service_info.device
entry.async_on_unload(
async_register_callback(
hass,
_async_update_ble,
BluetoothCallbackMatcher({ADDRESS: address}),
BluetoothScanningMode.PASSIVE,
)
)
await coordinator.async_setup() await coordinator.async_setup()
await coordinator.async_config_entry_first_refresh() await coordinator.async_config_entry_first_refresh()
hass.data.setdefault(DOMAIN, {})[entry.entry_id] = coordinator hass.data.setdefault(DOMAIN, {})[entry.entry_id] = coordinator
await hass.config_entries.async_forward_entry_setups(entry, PLATFORMS) await hass.config_entries.async_forward_entry_setups(entry, PLATFORMS)
# Reload the entry when options (e.g. poll interval) change
entry.async_on_unload(entry.add_update_listener(_async_update_listener)) entry.async_on_unload(entry.add_update_listener(_async_update_listener))
return True return True
@@ -5,8 +5,9 @@ import asyncio
import logging import logging
import struct import struct
from bleak import BleakClient, BleakError from bleak import BleakError
from bleak.backends.device import BLEDevice from bleak.backends.device import BLEDevice
from bleak_retry_connector import establish_connection, BleakClientWithServiceCache
from .const import ( from .const import (
FRAME_END, FRAME_END,
@@ -19,7 +20,7 @@ _LOGGER = logging.getLogger(__name__)
# Full frame layout: # Full frame layout:
# [0xDD] [CMD] [STATUS] [PAYLOAD_LEN] [PAYLOAD...] [CHK_HI] [CHK_LO] [0x77] # [0xDD] [CMD] [STATUS] [PAYLOAD_LEN] [PAYLOAD...] [CHK_HI] [CHK_LO] [0x77]
# Header = 4 bytes, trailer = 3 bytes (checksum × 2 + end marker) # Header = 4 bytes, trailer = 3 bytes (checksum x 2 + end marker)
_HEADER_LEN = 4 _HEADER_LEN = 4
_TRAILER_LEN = 3 _TRAILER_LEN = 3
@@ -27,7 +28,7 @@ _TRAILER_LEN = 3
class BmsBluetoothHandler: class BmsBluetoothHandler:
"""Protocol framing and parsing for a Xiaoxiang BMS device. """Protocol framing and parsing for a Xiaoxiang BMS device.
Designed for a connect poll disconnect pattern: the BMS only allows Designed for a connect -> poll -> disconnect pattern: the BMS only allows
one simultaneous BLE connection, so we hold it only for the duration of one simultaneous BLE connection, so we hold it only for the duration of
a single data fetch and release it immediately after. a single data fetch and release it immediately after.
""" """
@@ -35,9 +36,24 @@ class BmsBluetoothHandler:
def __init__(self, address: str) -> None: def __init__(self, address: str) -> None:
self._address = address self._address = address
self._buffer = bytearray() self._buffer = bytearray()
self._response_event = asyncio.Event() self._response_queue: asyncio.Queue[bytes] = asyncio.Queue()
self._response_data: bytes | None = None
self._lock = asyncio.Lock() # ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _drain_queue(self) -> None:
"""Discard any stale frames left in the queue from a prior cycle."""
while not self._response_queue.empty():
try:
self._response_queue.get_nowait()
except asyncio.QueueEmpty:
break
def _reset(self) -> None:
"""Clear all transient state before a new connection."""
self._buffer.clear()
self._drain_queue()
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# High-level poll — the only entry point the coordinator needs # High-level poll — the only entry point the coordinator needs
@@ -47,7 +63,7 @@ class BmsBluetoothHandler:
self, self,
ble_device: BLEDevice, ble_device: BLEDevice,
commands: list[bytes], commands: list[bytes],
timeout: float = 3.0, timeout: float = 5.0,
retries: int = 3, retries: int = 3,
) -> list[bytes | None]: ) -> list[bytes | None]:
"""Connect, send each command in sequence, disconnect. """Connect, send each command in sequence, disconnect.
@@ -56,13 +72,17 @@ class BmsBluetoothHandler:
only during the active read window and disconnecting immediately after, only during the active read window and disconnecting immediately after,
the mobile app (or any other client) can connect freely between polls. the mobile app (or any other client) can connect freely between polls.
""" """
self._reset()
_LOGGER.debug("Polling BMS at %s", self._address) _LOGGER.debug("Polling BMS at %s", self._address)
client = BleakClient(ble_device)
client = await establish_connection(
BleakClientWithServiceCache,
ble_device,
self._address,
max_attempts=3,
)
try: try:
await client.connect()
await client.start_notify(RX_CHAR_UUID, self._on_notify) await client.start_notify(RX_CHAR_UUID, self._on_notify)
# Give the BMS a moment to register the subscription before
# we start sending commands
await asyncio.sleep(0.3) await asyncio.sleep(0.3)
return [ return [
await self._request(client, cmd, timeout, retries) await self._request(client, cmd, timeout, retries)
@@ -71,9 +91,8 @@ class BmsBluetoothHandler:
finally: finally:
try: try:
await client.disconnect() await client.disconnect()
except Exception: except BleakError:
pass pass
self._buffer.clear()
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Frame reception # Frame reception
@@ -115,8 +134,7 @@ class BmsBluetoothHandler:
return return
_LOGGER.debug("BMS frame received (cmd=0x%02X, len=%d)", frame[1], payload_len) _LOGGER.debug("BMS frame received (cmd=0x%02X, len=%d)", frame[1], payload_len)
self._response_data = frame self._response_queue.put_nowait(frame)
self._response_event.set()
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Request / response (private — used inside poll()) # Request / response (private — used inside poll())
@@ -124,7 +142,7 @@ class BmsBluetoothHandler:
async def _request( async def _request(
self, self,
client: BleakClient, client: BleakClientWithServiceCache,
command: bytes, command: bytes,
timeout: float, timeout: float,
retries: int, retries: int,
@@ -134,30 +152,33 @@ class BmsBluetoothHandler:
Tries Write With Response first; falls back to Write Without Response Tries Write With Response first; falls back to Write Without Response
if the characteristic rejects it — covers both BMS firmware variants. if the characteristic rejects it — covers both BMS firmware variants.
""" """
async with self._lock:
for attempt in range(1, retries + 1): for attempt in range(1, retries + 1):
self._response_event.clear() # Drain any stale frames before sending a new command
self._response_data = None self._drain_queue()
self._buffer.clear()
try: try:
await client.write_gatt_char(TX_CHAR_UUID, command, response=True) await client.write_gatt_char(TX_CHAR_UUID, command, response=True)
except BleakError: except BleakError:
try: try:
await client.write_gatt_char(TX_CHAR_UUID, command, response=False) await client.write_gatt_char(TX_CHAR_UUID, command, response=False)
except BleakError as exc: except BleakError as exc:
_LOGGER.error("BLE write failed (attempt %d/%d): %s", _LOGGER.warning("BLE write failed (attempt %d/%d): %s",
attempt, retries, exc) attempt, retries, exc)
if attempt < retries: if attempt < retries:
await asyncio.sleep(0.3) await asyncio.sleep(0.5)
continue continue
try: try:
await asyncio.wait_for(self._response_event.wait(), timeout) frame = await asyncio.wait_for(
return self._response_data self._response_queue.get(), timeout
)
return frame
except asyncio.TimeoutError: except asyncio.TimeoutError:
_LOGGER.warning("BMS timeout (cmd=0x%s, attempt %d/%d)", _LOGGER.warning("BMS timeout (cmd=0x%s, attempt %d/%d)",
command.hex(), attempt, retries) command.hex(), attempt, retries)
if attempt < retries: if attempt < retries:
await asyncio.sleep(0.3) await asyncio.sleep(0.5)
return None return None
@@ -168,25 +189,29 @@ class BmsBluetoothHandler:
async def write_mos(self, ble_device: BLEDevice, value: int) -> bool: async def write_mos(self, ble_device: BLEDevice, value: int) -> bool:
"""Send a MOS control write command and return True on ACK. """Send a MOS control write command and return True on ACK.
Follows the same connect send disconnect pattern as poll() so Follows the same connect -> send -> disconnect pattern as poll() so
it doesn't interfere with the normal poll cycle. it doesn't interfere with the normal poll cycle.
""" """
self._reset()
command = self._build_mos_command(value) command = self._build_mos_command(value)
_LOGGER.debug("Writing MOS value 0x%02X to BMS at %s", value, self._address) _LOGGER.debug("Writing MOS value 0x%02X to BMS at %s", value, self._address)
client = BleakClient(ble_device)
client = await establish_connection(
BleakClientWithServiceCache,
ble_device,
self._address,
max_attempts=3,
)
try: try:
await client.connect()
await client.start_notify(RX_CHAR_UUID, self._on_notify) await client.start_notify(RX_CHAR_UUID, self._on_notify)
await asyncio.sleep(0.5) await asyncio.sleep(0.3)
response = await self._request(client, command, timeout=3.0, retries=2) response = await self._request(client, command, timeout=5.0, retries=2)
# Response: DD E1 00 00 CHK_H CHK_L 77 (status byte 0x00 = OK)
return response is not None and response[2] == 0x00 return response is not None and response[2] == 0x00
finally: finally:
try: try:
await client.disconnect() await client.disconnect()
except Exception: except BleakError:
pass pass
self._buffer.clear()
@staticmethod @staticmethod
def _build_mos_command(value: int) -> bytes: def _build_mos_command(value: int) -> bytes:
@@ -198,7 +223,7 @@ class BmsBluetoothHandler:
Checksum = two's complement of sum, high byte first. Checksum = two's complement of sum, high byte first.
Verified against spec example: Verified against spec example:
XX=0x02 sum=0xE5 ~0xE5+1=0xFF1B CHK FF 1B XX=0x02 -> sum=0xE5 -> ~0xE5+1=0xFF1B -> CHK FF 1B
""" """
checked = [0xE1, 0x02, 0x00, value & 0xFF] checked = [0xE1, 0x02, 0x00, value & 0xFF]
checksum = (~sum(checked) + 1) & 0xFFFF checksum = (~sum(checked) + 1) & 0xFFFF
@@ -217,10 +242,10 @@ class BmsBluetoothHandler:
"""Parse a 0x03 general info response frame. """Parse a 0x03 general info response frame.
Payload byte offsets (frame[4] is payload[0]): Payload byte offsets (frame[4] is payload[0]):
0-1 Total voltage uint16 BE ÷100 V 0-1 Total voltage uint16 BE /100 -> V
2-3 Current int16 BE ÷100 A (positive = charging, negative = discharging) 2-3 Current int16 BE /100 -> A (positive = charging, negative = discharging)
4-5 Residual capacity uint16 BE ÷100 Ah 4-5 Residual capacity uint16 BE /100 -> Ah
6-7 Nominal capacity uint16 BE ÷100 Ah 6-7 Nominal capacity uint16 BE /100 -> Ah
8-9 Cycle count uint16 BE 8-9 Cycle count uint16 BE
10-11 Production date (ignored) 10-11 Production date (ignored)
12-15 Balance status (ignored) 12-15 Balance status (ignored)
@@ -230,7 +255,7 @@ class BmsBluetoothHandler:
20 MOS status uint8 20 MOS status uint8
21 Cell count uint8 21 Cell count uint8
22 Temp probe count uint8 22 Temp probe count uint8
23+ Temperatures uint16 BE each (raw 2731) ÷ 10 → °C 23+ Temperatures uint16 BE each (raw - 2731) / 10 -> C
""" """
p = frame[_HEADER_LEN:-_TRAILER_LEN] p = frame[_HEADER_LEN:-_TRAILER_LEN]
@@ -284,11 +309,11 @@ class BmsBluetoothHandler:
def parse_cell_info(frame: bytes) -> dict: def parse_cell_info(frame: bytes) -> dict:
"""Parse a 0x04 cell voltage response frame. """Parse a 0x04 cell voltage response frame.
Per spec: frame[3] (the header length byte) = cell_count × 2. Per spec: frame[3] (the header length byte) = cell_count x 2.
The payload contains ONLY the voltage bytes — no count byte. The payload contains ONLY the voltage bytes — no count byte.
0+ Cell voltages uint16 BE each unit mV ÷1000 V 0+ Cell voltages uint16 BE each unit mV /1000 -> V
""" """
count = frame[3] // 2 # header length byte = N_cells × 2 count = frame[3] // 2 # header length byte = N_cells x 2
p = frame[_HEADER_LEN:-_TRAILER_LEN] p = frame[_HEADER_LEN:-_TRAILER_LEN]
voltages: list[float] = [] voltages: list[float] = []
for i in range(count): for i in range(count):
+2 -2
View File
@@ -5,8 +5,8 @@ DOMAIN = "xiaoxiang_bms"
CONF_ADDRESS = "address" CONF_ADDRESS = "address"
CONF_POLL_INTERVAL = "poll_interval" CONF_POLL_INTERVAL = "poll_interval"
DEFAULT_POLL_INTERVAL = 15 # seconds — each poll does a full BLE connect/disconnect DEFAULT_POLL_INTERVAL = 30 # seconds — each poll does a full BLE connect/disconnect
MIN_POLL_INTERVAL = 10 # below this the BMS has no breathing room between polls MIN_POLL_INTERVAL = 15 # below this the BMS has no breathing room between polls
MAX_POLL_INTERVAL = 300 MAX_POLL_INTERVAL = 300
# GATT UUIDs (Xiaoxiang BMS UART-over-GATT) # GATT UUIDs (Xiaoxiang BMS UART-over-GATT)
+23 -14
View File
@@ -5,6 +5,7 @@ import asyncio
import logging import logging
from datetime import timedelta from datetime import timedelta
from bleak.backends.device import BLEDevice
from homeassistant.components.bluetooth import async_ble_device_from_address from homeassistant.components.bluetooth import async_ble_device_from_address
from homeassistant.core import HomeAssistant from homeassistant.core import HomeAssistant
from homeassistant.exceptions import HomeAssistantError from homeassistant.exceptions import HomeAssistantError
@@ -19,13 +20,13 @@ _LOGGER = logging.getLogger(__name__)
# Only mark sensors unavailable after this many *consecutive* failed polls. # Only mark sensors unavailable after this many *consecutive* failed polls.
# Transient BLE misses (device not in cache, ESPHome proxy busy, etc.) return # Transient BLE misses (device not in cache, ESPHome proxy busy, etc.) return
# the last known data instead so the UI doesn't oscillate. # the last known data instead so the UI doesn't oscillate.
_FAILURES_BEFORE_UNAVAILABLE = 3 _FAILURES_BEFORE_UNAVAILABLE = 5
class BmsCoordinator(DataUpdateCoordinator[dict]): class BmsCoordinator(DataUpdateCoordinator[dict]):
"""Polls the BMS over BLE and distributes data to all sensor entities. """Polls the BMS over BLE and distributes data to all sensor entities.
Uses a connect read disconnect pattern on every poll so the BMS's Uses a connect -> read -> disconnect pattern on every poll so the BMS's
single BLE connection slot is free between updates (mobile app access). single BLE connection slot is free between updates (mobile app access).
""" """
@@ -44,10 +45,12 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
) )
self.address = address self.address = address
self._device_name = name self._device_name = name
self._poll_timeout = max(poll_interval - 3, 10) self._poll_timeout = max(poll_interval - 5, 20)
self._handler = BmsBluetoothHandler(address) self._handler = BmsBluetoothHandler(address)
self.hw_version: str | None = None self.hw_version: str | None = None
self._consecutive_failures = 0 self._consecutive_failures = 0
# Kept fresh by the BLE advertisement callback registered in __init__.py
self._ble_device: BLEDevice | None = None
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Device info — shared by sensor, binary_sensor, number platforms # Device info — shared by sensor, binary_sensor, number platforms
@@ -74,7 +77,7 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
async def async_write_mos(self, value: int) -> None: async def async_write_mos(self, value: int) -> None:
"""Send a MOS control command to the BMS, then refresh sensor state.""" """Send a MOS control command to the BMS, then refresh sensor state."""
device = async_ble_device_from_address(self.hass, self.address, connectable=True) device = self._get_ble_device()
if device is None: if device is None:
raise HomeAssistantError( raise HomeAssistantError(
f"BMS ({self.address}) not reachable — cannot send MOS command" f"BMS ({self.address}) not reachable — cannot send MOS command"
@@ -82,9 +85,22 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
success = await self._handler.write_mos(device, value) success = await self._handler.write_mos(device, value)
if not success: if not success:
raise HomeAssistantError("BMS did not acknowledge the MOS command") raise HomeAssistantError("BMS did not acknowledge the MOS command")
# Refresh immediately so sensors reflect the new MOS state
await self.async_request_refresh() await self.async_request_refresh()
# ------------------------------------------------------------------
# BLE device lookup
# ------------------------------------------------------------------
def _get_ble_device(self) -> BLEDevice | None:
"""Return the freshest available BLEDevice reference.
Prefers the advertisement-callback reference (_ble_device) because it
tracks proxy transport path changes. Falls back to the scanner cache.
"""
return self._ble_device or async_ble_device_from_address(
self.hass, self.address, connectable=True
)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Poll # Poll
# ------------------------------------------------------------------ # ------------------------------------------------------------------
@@ -92,7 +108,7 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
def _handle_failure(self, reason: str) -> dict: def _handle_failure(self, reason: str) -> dict:
"""On a transient failure, return cached data up to the threshold. """On a transient failure, return cached data up to the threshold.
Only raises UpdateFailed ( sensors go unavailable) after Only raises UpdateFailed (-> sensors go unavailable) after
_FAILURES_BEFORE_UNAVAILABLE consecutive misses. _FAILURES_BEFORE_UNAVAILABLE consecutive misses.
""" """
self._consecutive_failures += 1 self._consecutive_failures += 1
@@ -109,14 +125,7 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
async def _async_update_data(self) -> dict: async def _async_update_data(self) -> dict:
"""Connect to the BMS, fetch all data, disconnect.""" """Connect to the BMS, fetch all data, disconnect."""
# The BMS may not be in the scanner cache immediately after a disconnect. device = self._get_ble_device()
# Wait up to 2 s for an advertisement before giving up.
device = async_ble_device_from_address(self.hass, self.address, connectable=True)
if device is None:
await asyncio.sleep(2.0)
device = async_ble_device_from_address(
self.hass, self.address, connectable=True
)
if device is None: if device is None:
return self._handle_failure( return self._handle_failure(
f"BMS ({self.address}) not reachable — check Bluetooth adapter / proxy" f"BMS ({self.address}) not reachable — check Bluetooth adapter / proxy"