Switch to persistent BLE connection model
The connect/disconnect-every-cycle approach caused ~50% failure rate over 48h — each of the ~2880 daily connection attempts per device had a significant chance of failure through ESPHome proxies. New model (same as the user's Android app): - Connect once, keep the connection alive across poll cycles - _ensure_connected() reconnects automatically if the link drops - _on_disconnect() callback detects unexpected disconnections - On timeout, force-disconnect so next cycle gets a clean reconnect - Polls now only send commands (no connection overhead) — expected completion in <1s instead of 10-25s Connection lifecycle: startup → first poll → _ensure_connected() → persistent drop detected → next poll → _ensure_connected() → reconnected shutdown → async_teardown() → disconnect() Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -29,15 +29,72 @@ _TRAILER_LEN = 3
|
||||
class BmsBluetoothHandler:
|
||||
"""Protocol framing and parsing for a Xiaoxiang BMS device.
|
||||
|
||||
Designed for a connect -> poll -> disconnect pattern: the BMS only allows
|
||||
one simultaneous BLE connection, so we hold it only for the duration of
|
||||
a single data fetch and release it immediately after.
|
||||
Holds a **persistent** BLE connection. The connection is established on
|
||||
the first poll and kept alive across cycles. If it drops, the next poll
|
||||
automatically reconnects. This avoids the massive overhead of
|
||||
connect/disconnect every cycle through ESPHome BLE proxies.
|
||||
"""
|
||||
|
||||
def __init__(self, address: str) -> None:
|
||||
self._address = address
|
||||
self._buffer = bytearray()
|
||||
self._response_queue: asyncio.Queue[bytes] = asyncio.Queue()
|
||||
self._client: BleakClientWithServiceCache | None = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Connection management
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@property
|
||||
def is_connected(self) -> bool:
|
||||
return self._client is not None and self._client.is_connected
|
||||
|
||||
def _on_disconnect(self, _client) -> None:
|
||||
"""Called by bleak when the connection drops unexpectedly."""
|
||||
_LOGGER.debug("BMS %s disconnected", self._address)
|
||||
self._client = None
|
||||
|
||||
async def _ensure_connected(
|
||||
self,
|
||||
ble_device: BLEDevice,
|
||||
ble_device_callback: Callable[[], BLEDevice | None] | None = None,
|
||||
) -> None:
|
||||
"""Connect if not already connected."""
|
||||
if self.is_connected:
|
||||
return
|
||||
|
||||
# Clean up stale client
|
||||
if self._client is not None:
|
||||
try:
|
||||
await self._client.disconnect()
|
||||
except BleakError:
|
||||
pass
|
||||
self._client = None
|
||||
|
||||
self._reset()
|
||||
|
||||
_LOGGER.debug("Connecting to BMS %s", self._address)
|
||||
client = await establish_connection(
|
||||
BleakClientWithServiceCache,
|
||||
ble_device,
|
||||
self._address,
|
||||
disconnected_callback=self._on_disconnect,
|
||||
max_attempts=2,
|
||||
ble_device_callback=ble_device_callback,
|
||||
)
|
||||
await client.start_notify(RX_CHAR_UUID, self._on_notify)
|
||||
await asyncio.sleep(0.3)
|
||||
self._client = client
|
||||
_LOGGER.info("BMS %s connected", self._address)
|
||||
|
||||
async def disconnect(self) -> None:
|
||||
"""Explicitly close the connection (used during teardown)."""
|
||||
if self._client is not None:
|
||||
try:
|
||||
await self._client.disconnect()
|
||||
except BleakError:
|
||||
pass
|
||||
self._client = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
@@ -52,12 +109,12 @@ class BmsBluetoothHandler:
|
||||
break
|
||||
|
||||
def _reset(self) -> None:
|
||||
"""Clear all transient state before a new connection."""
|
||||
"""Clear all transient state."""
|
||||
self._buffer.clear()
|
||||
self._drain_queue()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# High-level poll — the only entry point the coordinator needs
|
||||
# High-level poll
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def poll(
|
||||
@@ -68,46 +125,26 @@ class BmsBluetoothHandler:
|
||||
retries: int = 2,
|
||||
ble_device_callback: Callable[[], BLEDevice | None] | None = None,
|
||||
) -> list[bytes | None]:
|
||||
"""Connect, send each command in sequence, disconnect.
|
||||
"""Send each command over the persistent connection, return responses.
|
||||
|
||||
The BMS only supports a single BLE connection at a time. By connecting
|
||||
only during the active read window and disconnecting immediately after,
|
||||
the mobile app (or any other client) can connect freely between polls.
|
||||
Connects automatically if not already connected. If the connection
|
||||
drops mid-poll, raises BleakError so the coordinator can handle it.
|
||||
"""
|
||||
self._reset()
|
||||
_LOGGER.debug("Polling BMS at %s", self._address)
|
||||
self._drain_queue()
|
||||
|
||||
await self._ensure_connected(ble_device, ble_device_callback)
|
||||
|
||||
client = await establish_connection(
|
||||
BleakClientWithServiceCache,
|
||||
ble_device,
|
||||
self._address,
|
||||
max_attempts=2,
|
||||
ble_device_callback=ble_device_callback,
|
||||
)
|
||||
try:
|
||||
await client.start_notify(RX_CHAR_UUID, self._on_notify)
|
||||
await asyncio.sleep(0.3)
|
||||
return [
|
||||
await self._request(client, cmd, timeout, retries)
|
||||
await self._request(self._client, cmd, timeout, retries)
|
||||
for cmd in commands
|
||||
]
|
||||
finally:
|
||||
try:
|
||||
await client.disconnect()
|
||||
except BleakError:
|
||||
pass
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Frame reception
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _on_notify(self, _char, data: bytearray) -> None:
|
||||
"""Accumulate BLE notification chunks into complete protocol frames.
|
||||
|
||||
BLE max payload is 20 bytes (default MTU), so a single BMS frame
|
||||
(up to ~50 bytes for 16 cells) arrives across several notifications.
|
||||
We buffer until we can calculate and verify the expected frame length.
|
||||
"""
|
||||
"""Accumulate BLE notification chunks into complete protocol frames."""
|
||||
self._buffer.extend(data)
|
||||
|
||||
# Discard leading garbage until we see a frame start byte
|
||||
@@ -140,7 +177,7 @@ class BmsBluetoothHandler:
|
||||
self._response_queue.put_nowait(frame)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Request / response (private — used inside poll())
|
||||
# Request / response
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _request(
|
||||
@@ -150,19 +187,22 @@ class BmsBluetoothHandler:
|
||||
timeout: float,
|
||||
retries: int,
|
||||
) -> bytes | None:
|
||||
"""Send one command and wait for the response frame, with retries.
|
||||
|
||||
Tries Write With Response first; falls back to Write Without Response
|
||||
if the characteristic rejects it — covers both BMS firmware variants.
|
||||
"""
|
||||
"""Send one command and wait for the response frame, with retries."""
|
||||
for attempt in range(1, retries + 1):
|
||||
# Drain any stale frames before sending a new command
|
||||
if not client.is_connected:
|
||||
_LOGGER.warning("BMS %s connection lost during request", self._address)
|
||||
self._client = None
|
||||
return None
|
||||
|
||||
self._drain_queue()
|
||||
self._buffer.clear()
|
||||
|
||||
try:
|
||||
await client.write_gatt_char(TX_CHAR_UUID, command, response=True)
|
||||
except BleakError:
|
||||
if not client.is_connected:
|
||||
self._client = None
|
||||
return None
|
||||
try:
|
||||
await client.write_gatt_char(TX_CHAR_UUID, command, response=False)
|
||||
except BleakError as exc:
|
||||
@@ -195,32 +235,15 @@ class BmsBluetoothHandler:
|
||||
value: int,
|
||||
ble_device_callback: Callable[[], BLEDevice | None] | None = None,
|
||||
) -> bool:
|
||||
"""Send a MOS control write command and return True on ACK.
|
||||
|
||||
Follows the same connect -> send -> disconnect pattern as poll() so
|
||||
it doesn't interfere with the normal poll cycle.
|
||||
"""
|
||||
self._reset()
|
||||
"""Send a MOS control write command over the persistent connection."""
|
||||
self._drain_queue()
|
||||
command = self._build_mos_command(value)
|
||||
_LOGGER.debug("Writing MOS value 0x%02X to BMS at %s", value, self._address)
|
||||
|
||||
client = await establish_connection(
|
||||
BleakClientWithServiceCache,
|
||||
ble_device,
|
||||
self._address,
|
||||
max_attempts=2,
|
||||
ble_device_callback=ble_device_callback,
|
||||
)
|
||||
try:
|
||||
await client.start_notify(RX_CHAR_UUID, self._on_notify)
|
||||
await asyncio.sleep(0.3)
|
||||
response = await self._request(client, command, timeout=3.0, retries=2)
|
||||
await self._ensure_connected(ble_device, ble_device_callback)
|
||||
|
||||
response = await self._request(self._client, command, timeout=3.0, retries=2)
|
||||
return response is not None and response[2] == 0x00
|
||||
finally:
|
||||
try:
|
||||
await client.disconnect()
|
||||
except BleakError:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _build_mos_command(value: int) -> bytes:
|
||||
@@ -230,9 +253,6 @@ class BmsBluetoothHandler:
|
||||
Checked bytes (per spec): command_code + length + data bytes
|
||||
= 0xE1 + 0x02 + 0x00 + XX
|
||||
Checksum = two's complement of sum, high byte first.
|
||||
|
||||
Verified against spec example:
|
||||
XX=0x02 -> sum=0xE5 -> ~0xE5+1=0xFF1B -> CHK FF 1B
|
||||
"""
|
||||
checked = [0xE1, 0x02, 0x00, value & 0xFF]
|
||||
checksum = (~sum(checked) + 1) & 0xFFFF
|
||||
@@ -252,7 +272,7 @@ class BmsBluetoothHandler:
|
||||
|
||||
Payload byte offsets (frame[4] is payload[0]):
|
||||
0-1 Total voltage uint16 BE /100 -> V
|
||||
2-3 Current int16 BE /100 -> A (positive = charging, negative = discharging)
|
||||
2-3 Current int16 BE /100 -> A (positive = charging)
|
||||
4-5 Residual capacity uint16 BE /100 -> Ah
|
||||
6-7 Nominal capacity uint16 BE /100 -> Ah
|
||||
8-9 Cycle count uint16 BE
|
||||
@@ -287,12 +307,9 @@ class BmsBluetoothHandler:
|
||||
"state_of_charge": p[19],
|
||||
"cell_count": p[21],
|
||||
"temperatures": temperatures,
|
||||
# MOS status
|
||||
"mos_charge_enabled": bool(mos & 0x01),
|
||||
"mos_discharge_enabled": bool(mos & 0x02),
|
||||
# Cell balancing (any cell currently balancing)
|
||||
"balance_active": balance != 0,
|
||||
# Protection flags (bit per event, True = protection triggered)
|
||||
"prot_cell_overvolt": bool(prot & (1 << 0)),
|
||||
"prot_cell_undervolt": bool(prot & (1 << 1)),
|
||||
"prot_pack_overvolt": bool(prot & (1 << 2)),
|
||||
@@ -319,10 +336,8 @@ class BmsBluetoothHandler:
|
||||
"""Parse a 0x04 cell voltage response frame.
|
||||
|
||||
Per spec: frame[3] (the header length byte) = cell_count x 2.
|
||||
The payload contains ONLY the voltage bytes — no count byte.
|
||||
0+ Cell voltages uint16 BE each unit mV /1000 -> V
|
||||
"""
|
||||
count = frame[3] // 2 # header length byte = N_cells x 2
|
||||
count = frame[3] // 2
|
||||
p = frame[_HEADER_LEN:-_TRAILER_LEN]
|
||||
voltages: list[float] = []
|
||||
for i in range(count):
|
||||
|
||||
@@ -5,6 +5,7 @@ import asyncio
|
||||
import logging
|
||||
from datetime import timedelta
|
||||
|
||||
from bleak import BleakError
|
||||
from bleak.backends.device import BLEDevice
|
||||
from homeassistant.components.bluetooth import async_ble_device_from_address
|
||||
from homeassistant.core import HomeAssistant
|
||||
@@ -18,21 +19,19 @@ from .const import CMD_CELL, CMD_GENERAL, CMD_VERSION, DOMAIN
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
# Only mark sensors unavailable after this many *consecutive* failed polls.
|
||||
# Transient BLE misses (device not in cache, ESPHome proxy busy, etc.) return
|
||||
# the last known data instead so the UI doesn't oscillate.
|
||||
_FAILURES_BEFORE_UNAVAILABLE = 5
|
||||
|
||||
# Hard ceiling on the BLE poll operation (connect + commands + disconnect).
|
||||
# With the global lock preventing contention, connections should be fast —
|
||||
# 15 s is generous for 2 commands over a local proxy.
|
||||
# Hard ceiling on a single poll (commands only — no connection overhead
|
||||
# when already connected). Reconnection adds ~5 s on top.
|
||||
_POLL_TIMEOUT = 15
|
||||
|
||||
|
||||
class BmsCoordinator(DataUpdateCoordinator[dict]):
|
||||
"""Polls the BMS over BLE and distributes data to all sensor entities.
|
||||
|
||||
Uses a connect -> read -> disconnect pattern on every poll so the BMS's
|
||||
single BLE connection slot is free between updates (mobile app access).
|
||||
Holds a **persistent** BLE connection per device. The connection is
|
||||
established on the first poll and kept alive across cycles. If it
|
||||
drops, the next poll automatically reconnects.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -56,12 +55,12 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
|
||||
self._consecutive_failures = 0
|
||||
# Kept fresh by the BLE advertisement callback registered in __init__.py
|
||||
self._ble_device: BLEDevice | None = None
|
||||
# Shared across all BMS coordinator instances so only one BMS connects
|
||||
# at a time — prevents ESPHome proxy connection slot exhaustion.
|
||||
# Shared across all BMS coordinator instances — serialises BLE
|
||||
# operations so multiple devices don't fight for proxy slots.
|
||||
self._ble_lock = ble_lock or asyncio.Lock()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Device info — shared by sensor, binary_sensor, number platforms
|
||||
# Device info
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@property
|
||||
@@ -78,10 +77,11 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def async_setup(self) -> None:
|
||||
"""No-op — no persistent connection to establish."""
|
||||
"""No-op — connection is established lazily on first poll."""
|
||||
|
||||
async def async_teardown(self) -> None:
|
||||
"""No-op — each poll disconnects itself."""
|
||||
"""Disconnect the persistent BLE connection."""
|
||||
await self._handler.disconnect()
|
||||
|
||||
async def async_write_mos(self, value: int) -> None:
|
||||
"""Send a MOS control command to the BMS, then refresh sensor state."""
|
||||
@@ -91,10 +91,13 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
|
||||
f"BMS ({self.address}) not reachable — cannot send MOS command"
|
||||
)
|
||||
async with self._ble_lock:
|
||||
try:
|
||||
success = await self._handler.write_mos(
|
||||
device, value,
|
||||
ble_device_callback=self._get_ble_device,
|
||||
)
|
||||
except (BleakError, asyncio.TimeoutError) as exc:
|
||||
raise HomeAssistantError(f"MOS command failed: {exc}") from exc
|
||||
if not success:
|
||||
raise HomeAssistantError("BMS did not acknowledge the MOS command")
|
||||
await self.async_request_refresh()
|
||||
@@ -104,25 +107,22 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _get_ble_device(self) -> BLEDevice | None:
|
||||
"""Return the freshest available BLEDevice reference.
|
||||
|
||||
Prefers the advertisement-callback reference (_ble_device) because it
|
||||
tracks proxy transport path changes. Falls back to the scanner cache.
|
||||
"""
|
||||
return self._ble_device or async_ble_device_from_address(
|
||||
"""Return the freshest available BLEDevice reference."""
|
||||
if self._ble_device is not None:
|
||||
return self._ble_device
|
||||
device = async_ble_device_from_address(
|
||||
self.hass, self.address, connectable=True
|
||||
)
|
||||
if device is not None:
|
||||
_LOGGER.debug("BMS %s found via scanner cache", self.address)
|
||||
return device
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Poll
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _handle_failure(self, reason: str) -> dict:
|
||||
"""On a transient failure, return cached data up to the threshold.
|
||||
|
||||
Only raises UpdateFailed (-> sensors go unavailable) after
|
||||
_FAILURES_BEFORE_UNAVAILABLE consecutive misses.
|
||||
"""
|
||||
"""Return cached data up to the threshold, then go unavailable."""
|
||||
self._consecutive_failures += 1
|
||||
if self._consecutive_failures <= _FAILURES_BEFORE_UNAVAILABLE and self.data:
|
||||
_LOGGER.debug(
|
||||
@@ -135,9 +135,17 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
|
||||
raise UpdateFailed(reason)
|
||||
|
||||
async def _async_update_data(self) -> dict:
|
||||
"""Connect to the BMS, fetch all data, disconnect."""
|
||||
"""Poll the BMS over the persistent connection."""
|
||||
|
||||
device = self._get_ble_device()
|
||||
if device is None:
|
||||
# Wait up to 5 s for an advertisement (after HA restart / proxy reconnect)
|
||||
_LOGGER.debug("BMS %s not discovered yet, waiting…", self.address)
|
||||
for _ in range(5):
|
||||
await asyncio.sleep(1.0)
|
||||
device = self._get_ble_device()
|
||||
if device is not None:
|
||||
break
|
||||
if device is None:
|
||||
return self._handle_failure(
|
||||
f"BMS ({self.address}) not reachable — check Bluetooth adapter / proxy"
|
||||
@@ -147,8 +155,7 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
|
||||
if self.hw_version is None:
|
||||
commands.append(CMD_VERSION)
|
||||
|
||||
# Only one BMS polls at a time — prevents proxy connection slot contention.
|
||||
# The timeout wraps only the actual BLE operation, not the lock wait.
|
||||
# Serialise BLE operations across all BMS instances.
|
||||
async with self._ble_lock:
|
||||
try:
|
||||
responses = await asyncio.wait_for(
|
||||
@@ -160,10 +167,12 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
|
||||
timeout=_POLL_TIMEOUT,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
# Connection might be dead — force reconnect next cycle
|
||||
await self._handler.disconnect()
|
||||
return self._handle_failure(
|
||||
f"BMS poll timed out after {_POLL_TIMEOUT}s"
|
||||
)
|
||||
except Exception as exc:
|
||||
except (BleakError, Exception) as exc:
|
||||
return self._handle_failure(f"BMS poll failed: {exc}")
|
||||
|
||||
general_frame, cell_frame = responses[0], responses[1]
|
||||
@@ -194,9 +203,10 @@ class BmsCoordinator(DataUpdateCoordinator[dict]):
|
||||
data["cell_delta"] = None
|
||||
|
||||
_LOGGER.debug(
|
||||
"BMS data: %.2fV %.2fA %d%% %.2fAh %.3fkWh %d cells",
|
||||
"BMS data: %.2fV %.2fA %d%% %.2fAh %.3fkWh %d cells (connected: %s)",
|
||||
data["voltage"], data["current"], data["state_of_charge"],
|
||||
data["residual_capacity"], data["energy_stored"],
|
||||
len(data["cell_voltages"]),
|
||||
self._handler.is_connected,
|
||||
)
|
||||
return data
|
||||
|
||||
Reference in New Issue
Block a user