Switch to persistent BLE connection model

The connect/disconnect-every-cycle approach caused a ~50% failure rate
over 48h — each of the ~2880 daily connection attempts per device had
a significant chance of failing when routed through ESPHome proxies.

New model (same as the user's Android app):
- Connect once, keep the connection alive across poll cycles
- _ensure_connected() reconnects automatically if the link drops
- _on_disconnect() callback detects unexpected disconnections
- On timeout, force-disconnect so the next cycle gets a clean reconnect
- Polls now only send commands (no connection overhead) — expected
  completion in <1s instead of 10-25s

Connection lifecycle:
  startup → first poll → _ensure_connected() → persistent
  drop detected → next poll → _ensure_connected() → reconnected
  shutdown → async_teardown() → disconnect()

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 17:46:20 +02:00
parent dcc528b96a
commit b8bee14839
2 changed files with 133 additions and 108 deletions
@@ -29,15 +29,72 @@ _TRAILER_LEN = 3
class BmsBluetoothHandler:
"""Protocol framing and parsing for a Xiaoxiang BMS device.
Designed for a connect -> poll -> disconnect pattern: the BMS only allows
one simultaneous BLE connection, so we hold it only for the duration of
a single data fetch and release it immediately after.
Holds a **persistent** BLE connection. The connection is established on
the first poll and kept alive across cycles. If it drops, the next poll
automatically reconnects. This avoids the massive overhead of
connect/disconnect every cycle through ESPHome BLE proxies.
"""
def __init__(self, address: str) -> None:
self._address = address
self._buffer = bytearray()
self._response_queue: asyncio.Queue[bytes] = asyncio.Queue()
self._client: BleakClientWithServiceCache | None = None
# ------------------------------------------------------------------
# Connection management
# ------------------------------------------------------------------
@property
def is_connected(self) -> bool:
    """Whether a client exists and reports itself as connected."""
    client = self._client
    if client is None:
        return False
    return client.is_connected
def _on_disconnect(self, _client) -> None:
    """Handle the disconnected callback registered in ``_ensure_connected``.

    Args:
        _client: The client object whose link dropped, as supplied by the
            BLE library when it invokes the callback.
    """
    _LOGGER.debug("BMS %s disconnected", self._address)
    # Only forget the client that actually dropped. A late callback from a
    # previous, already-replaced client must not discard the reference to
    # the current live connection: it could then never be closed, and the
    # next poll would try to open a second connection alongside it.
    # If the identities do not match, the reference is simply kept; a dead
    # client is still detected through its own ``is_connected`` flag.
    if self._client is _client:
        self._client = None
async def _ensure_connected(
    self,
    ble_device: BLEDevice,
    ble_device_callback: Callable[[], BLEDevice | None] | None = None,
) -> None:
    """Make sure a connection with notifications enabled exists.

    Returns immediately when ``is_connected`` is already true. Otherwise
    any stale client is closed, transient state is reset, a new connection
    is established and notifications on ``RX_CHAR_UUID`` are subscribed.
    ``self._client`` is only assigned once that whole setup has succeeded;
    if setup fails or is cancelled, the new client is disconnected again.

    Args:
        ble_device: Device handle passed to ``establish_connection``.
        ble_device_callback: Optional callable forwarded unchanged to
            ``establish_connection``.

    Raises:
        Any exception raised by ``establish_connection`` or
        ``start_notify`` is propagated unchanged.
    """
    if self.is_connected:
        return

    # Clean up a stale client (present, but no longer connected).
    stale = self._client
    if stale is not None:
        try:
            await stale.disconnect()
        except BleakError as exc:
            # Best effort: the link is already considered dead.
            _LOGGER.debug(
                "BMS %s: ignoring error while closing stale client: %s",
                self._address,
                exc,
            )
        self._client = None

    self._reset()
    _LOGGER.debug("Connecting to BMS %s", self._address)

    client = await establish_connection(
        BleakClientWithServiceCache,
        ble_device,
        self._address,
        disconnected_callback=self._on_disconnect,
        max_attempts=2,
        ble_device_callback=ble_device_callback,
    )
    try:
        await client.start_notify(RX_CHAR_UUID, self._on_notify)
        # NOTE(review): presumably a settle delay after subscribing —
        # confirm whether the BMS actually requires it.
        await asyncio.sleep(0.3)
    except BaseException:
        # Includes task cancellation (e.g. a caller-side timeout). The
        # client is connected but not yet stored on self, so nobody else
        # could ever close it; release it here before propagating.
        try:
            await client.disconnect()
        except BleakError as exc:
            _LOGGER.debug(
                "BMS %s: ignoring error while closing half-open client: %s",
                self._address,
                exc,
            )
        raise

    self._client = client
    _LOGGER.info("BMS %s connected", self._address)
async def disconnect(self) -> None:
    """Close the connection if one exists and forget the client.

    A ``BleakError`` raised while disconnecting is ignored.
    """
    if self._client is None:
        return
    client = self._client
    try:
        await client.disconnect()
    except BleakError:
        pass
    self._client = None
# ------------------------------------------------------------------
# Internal helpers
@@ -52,12 +109,12 @@ class BmsBluetoothHandler:
break
def _reset(self) -> None:
"""Clear all transient state before a new connection."""
"""Clear all transient state."""
self._buffer.clear()
self._drain_queue()
# ------------------------------------------------------------------
# High-level poll — the only entry point the coordinator needs
# High-level poll
# ------------------------------------------------------------------
async def poll(
@@ -68,46 +125,26 @@ class BmsBluetoothHandler:
retries: int = 2,
ble_device_callback: Callable[[], BLEDevice | None] | None = None,
) -> list[bytes | None]:
"""Connect, send each command in sequence, disconnect.
"""Send each command over the persistent connection, return responses.
The BMS only supports a single BLE connection at a time. By connecting
only during the active read window and disconnecting immediately after,
the mobile app (or any other client) can connect freely between polls.
Connects automatically if not already connected. If the connection
drops mid-poll, raises BleakError so the coordinator can handle it.
"""
self._reset()
_LOGGER.debug("Polling BMS at %s", self._address)
self._drain_queue()
client = await establish_connection(
BleakClientWithServiceCache,
ble_device,
self._address,
max_attempts=2,
ble_device_callback=ble_device_callback,
)
try:
await client.start_notify(RX_CHAR_UUID, self._on_notify)
await asyncio.sleep(0.3)
return [
await self._request(client, cmd, timeout, retries)
for cmd in commands
]
finally:
try:
await client.disconnect()
except BleakError:
pass
await self._ensure_connected(ble_device, ble_device_callback)
return [
await self._request(self._client, cmd, timeout, retries)
for cmd in commands
]
# ------------------------------------------------------------------
# Frame reception
# ------------------------------------------------------------------
def _on_notify(self, _char, data: bytearray) -> None:
"""Accumulate BLE notification chunks into complete protocol frames.
BLE max payload is 20 bytes (default MTU), so a single BMS frame
(up to ~50 bytes for 16 cells) arrives across several notifications.
We buffer until we can calculate and verify the expected frame length.
"""
"""Accumulate BLE notification chunks into complete protocol frames."""
self._buffer.extend(data)
# Discard leading garbage until we see a frame start byte
@@ -140,7 +177,7 @@ class BmsBluetoothHandler:
self._response_queue.put_nowait(frame)
# ------------------------------------------------------------------
# Request / response (private — used inside poll())
# Request / response
# ------------------------------------------------------------------
async def _request(
@@ -150,19 +187,22 @@ class BmsBluetoothHandler:
timeout: float,
retries: int,
) -> bytes | None:
"""Send one command and wait for the response frame, with retries.
Tries Write With Response first; falls back to Write Without Response
if the characteristic rejects it — covers both BMS firmware variants.
"""
"""Send one command and wait for the response frame, with retries."""
for attempt in range(1, retries + 1):
# Drain any stale frames before sending a new command
if not client.is_connected:
_LOGGER.warning("BMS %s connection lost during request", self._address)
self._client = None
return None
self._drain_queue()
self._buffer.clear()
try:
await client.write_gatt_char(TX_CHAR_UUID, command, response=True)
except BleakError:
if not client.is_connected:
self._client = None
return None
try:
await client.write_gatt_char(TX_CHAR_UUID, command, response=False)
except BleakError as exc:
@@ -195,32 +235,15 @@ class BmsBluetoothHandler:
value: int,
ble_device_callback: Callable[[], BLEDevice | None] | None = None,
) -> bool:
"""Send a MOS control write command and return True on ACK.
Follows the same connect -> send -> disconnect pattern as poll() so
it doesn't interfere with the normal poll cycle.
"""
self._reset()
"""Send a MOS control write command over the persistent connection."""
self._drain_queue()
command = self._build_mos_command(value)
_LOGGER.debug("Writing MOS value 0x%02X to BMS at %s", value, self._address)
client = await establish_connection(
BleakClientWithServiceCache,
ble_device,
self._address,
max_attempts=2,
ble_device_callback=ble_device_callback,
)
try:
await client.start_notify(RX_CHAR_UUID, self._on_notify)
await asyncio.sleep(0.3)
response = await self._request(client, command, timeout=3.0, retries=2)
return response is not None and response[2] == 0x00
finally:
try:
await client.disconnect()
except BleakError:
pass
await self._ensure_connected(ble_device, ble_device_callback)
response = await self._request(self._client, command, timeout=3.0, retries=2)
return response is not None and response[2] == 0x00
@staticmethod
def _build_mos_command(value: int) -> bytes:
@@ -230,9 +253,6 @@ class BmsBluetoothHandler:
Checked bytes (per spec): command_code + length + data bytes
= 0xE1 + 0x02 + 0x00 + XX
Checksum = two's complement of sum, high byte first.
Verified against spec example:
XX=0x02 -> sum=0xE5 -> ~0xE5+1=0xFF1B -> CHK FF 1B
"""
checked = [0xE1, 0x02, 0x00, value & 0xFF]
checksum = (~sum(checked) + 1) & 0xFFFF
@@ -252,7 +272,7 @@ class BmsBluetoothHandler:
Payload byte offsets (frame[4] is payload[0]):
0-1 Total voltage uint16 BE /100 -> V
2-3 Current int16 BE /100 -> A (positive = charging, negative = discharging)
2-3 Current int16 BE /100 -> A (positive = charging)
4-5 Residual capacity uint16 BE /100 -> Ah
6-7 Nominal capacity uint16 BE /100 -> Ah
8-9 Cycle count uint16 BE
@@ -287,12 +307,9 @@ class BmsBluetoothHandler:
"state_of_charge": p[19],
"cell_count": p[21],
"temperatures": temperatures,
# MOS status
"mos_charge_enabled": bool(mos & 0x01),
"mos_discharge_enabled": bool(mos & 0x02),
# Cell balancing (any cell currently balancing)
"balance_active": balance != 0,
# Protection flags (bit per event, True = protection triggered)
"prot_cell_overvolt": bool(prot & (1 << 0)),
"prot_cell_undervolt": bool(prot & (1 << 1)),
"prot_pack_overvolt": bool(prot & (1 << 2)),
@@ -319,10 +336,8 @@ class BmsBluetoothHandler:
"""Parse a 0x04 cell voltage response frame.
Per spec: frame[3] (the header length byte) = cell_count x 2.
The payload contains ONLY the voltage bytes — no count byte.
0+ Cell voltages uint16 BE each unit mV /1000 -> V
"""
count = frame[3] // 2 # header length byte = N_cells x 2
count = frame[3] // 2
p = frame[_HEADER_LEN:-_TRAILER_LEN]
voltages: list[float] = []
for i in range(count):