Refactor BLE layer for 24/7 reliability

- Replace raw BleakClient with establish_connection from
  bleak-retry-connector (retries, GATT service cache, proxy-aware)
- Replace fragile asyncio.Event with asyncio.Queue for response frames, and
  drain stale frames on each connection to prevent cross-cycle leakage
- Register BLE advertisement callback to keep BLEDevice reference fresh
  across ESPHome proxy path changes
- Remove asyncio.sleep(2) device lookup hack
- Increase poll timeout floor from 10s to 20s
- Increase failure tolerance from 3 to 5 consecutive misses
- Bump default poll interval to 30s, min to 15s (halves connection churn)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-12 09:36:08 +02:00
parent c00d9b66c2
commit 1520ed3c0f
4 changed files with 131 additions and 72 deletions
@@ -5,8 +5,9 @@ import asyncio
import logging
import struct
from bleak import BleakClient, BleakError
from bleak import BleakError
from bleak.backends.device import BLEDevice
from bleak_retry_connector import establish_connection, BleakClientWithServiceCache
from .const import (
FRAME_END,
@@ -19,7 +20,7 @@ _LOGGER = logging.getLogger(__name__)
# Full frame layout:
# [0xDD] [CMD] [STATUS] [PAYLOAD_LEN] [PAYLOAD...] [CHK_HI] [CHK_LO] [0x77]
# Header = 4 bytes, trailer = 3 bytes (checksum × 2 + end marker)
# Header = 4 bytes, trailer = 3 bytes (checksum x 2 + end marker)
_HEADER_LEN = 4
_TRAILER_LEN = 3
@@ -27,7 +28,7 @@ _TRAILER_LEN = 3
class BmsBluetoothHandler:
"""Protocol framing and parsing for a Xiaoxiang BMS device.
Designed for a connect → poll → disconnect pattern: the BMS only allows
Designed for a connect -> poll -> disconnect pattern: the BMS only allows
one simultaneous BLE connection, so we hold it only for the duration of
a single data fetch and release it immediately after.
"""
@@ -35,9 +36,24 @@ class BmsBluetoothHandler:
def __init__(self, address: str) -> None:
"""Set up per-device state; no BLE connection is opened here.

address is kept for log messages and is passed as the device name to
establish_connection() by poll() and write_mos().
"""
# NOTE(review): this span comes from a diff view, so pre-change and
# post-change attribute lines appear side by side. Per the commit message
# the Event-based attributes were replaced by the queue; confirm which of
# the lines below still exist in the real file.
self._address = address
# Cleared by _reset() and before every request attempt; presumably holds
# raw notification bytes until a complete frame is recognised (verify
# against _on_notify).
self._buffer = bytearray()
# Event-based hand-off: the notify path stores the frame in _response_data
# and sets the event, and the request path waits on the event.
self._response_event = asyncio.Event()
self._response_data: bytes | None = None
# Held via "async with" around the request/retry loop in the Event-based
# version of _request().
self._lock = asyncio.Lock()
# Queue-based hand-off: validated frames are pushed with put_nowait() and
# consumed by _request(); _drain_queue() discards leftovers.
self._response_queue: asyncio.Queue[bytes] = asyncio.Queue()
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _drain_queue(self) -> None:
"""Discard any stale frames left in the queue from a prior cycle."""
while not self._response_queue.empty():
try:
self._response_queue.get_nowait()
except asyncio.QueueEmpty:
break
def _reset(self) -> None:
"""Clear all transient state before a new connection."""
self._buffer.clear()
self._drain_queue()
# ------------------------------------------------------------------
# High-level poll — the only entry point the coordinator needs
@@ -47,7 +63,7 @@ class BmsBluetoothHandler:
self,
ble_device: BLEDevice,
commands: list[bytes],
timeout: float = 3.0,
timeout: float = 5.0,
retries: int = 3,
) -> list[bytes | None]:
"""Connect, send each command in sequence, disconnect.
@@ -56,13 +72,17 @@ class BmsBluetoothHandler:
only during the active read window and disconnecting immediately after,
the mobile app (or any other client) can connect freely between polls.
"""
self._reset()
_LOGGER.debug("Polling BMS at %s", self._address)
client = BleakClient(ble_device)
client = await establish_connection(
BleakClientWithServiceCache,
ble_device,
self._address,
max_attempts=3,
)
try:
await client.connect()
await client.start_notify(RX_CHAR_UUID, self._on_notify)
# Give the BMS a moment to register the subscription before
# we start sending commands
await asyncio.sleep(0.3)
return [
await self._request(client, cmd, timeout, retries)
@@ -71,9 +91,8 @@ class BmsBluetoothHandler:
finally:
try:
await client.disconnect()
except Exception:
except BleakError:
pass
self._buffer.clear()
# ------------------------------------------------------------------
# Frame reception
@@ -115,8 +134,7 @@ class BmsBluetoothHandler:
return
_LOGGER.debug("BMS frame received (cmd=0x%02X, len=%d)", frame[1], payload_len)
self._response_data = frame
self._response_event.set()
self._response_queue.put_nowait(frame)
# ------------------------------------------------------------------
# Request / response (private — used inside poll())
@@ -124,7 +142,7 @@ class BmsBluetoothHandler:
async def _request(
self,
client: BleakClient,
client: BleakClientWithServiceCache,
command: bytes,
timeout: float,
retries: int,
@@ -134,30 +152,33 @@ class BmsBluetoothHandler:
Tries Write With Response first; falls back to Write Without Response
if the characteristic rejects it — covers both BMS firmware variants.
"""
async with self._lock:
for attempt in range(1, retries + 1):
self._response_event.clear()
self._response_data = None
try:
await client.write_gatt_char(TX_CHAR_UUID, command, response=True)
except BleakError:
try:
await client.write_gatt_char(TX_CHAR_UUID, command, response=False)
except BleakError as exc:
_LOGGER.error("BLE write failed (attempt %d/%d): %s",
attempt, retries, exc)
if attempt < retries:
await asyncio.sleep(0.3)
continue
for attempt in range(1, retries + 1):
# Drain any stale frames before sending a new command
self._drain_queue()
self._buffer.clear()
try:
await client.write_gatt_char(TX_CHAR_UUID, command, response=True)
except BleakError:
try:
await asyncio.wait_for(self._response_event.wait(), timeout)
return self._response_data
except asyncio.TimeoutError:
_LOGGER.warning("BMS timeout (cmd=0x%s, attempt %d/%d)",
command.hex(), attempt, retries)
await client.write_gatt_char(TX_CHAR_UUID, command, response=False)
except BleakError as exc:
_LOGGER.warning("BLE write failed (attempt %d/%d): %s",
attempt, retries, exc)
if attempt < retries:
await asyncio.sleep(0.3)
await asyncio.sleep(0.5)
continue
try:
frame = await asyncio.wait_for(
self._response_queue.get(), timeout
)
return frame
except asyncio.TimeoutError:
_LOGGER.warning("BMS timeout (cmd=0x%s, attempt %d/%d)",
command.hex(), attempt, retries)
if attempt < retries:
await asyncio.sleep(0.5)
return None
@@ -168,25 +189,29 @@ class BmsBluetoothHandler:
async def write_mos(self, ble_device: BLEDevice, value: int) -> bool:
"""Send a MOS control write command and return True on ACK.
Follows the same connect send disconnect pattern as poll() so
Follows the same connect -> send -> disconnect pattern as poll() so
it doesn't interfere with the normal poll cycle.

ble_device is the device handle used to open the connection. value is
the MOS control value; only its low 8 bits end up in the command (see
_build_mos_command).

Returns True when a response frame arrived and its status byte is 0x00,
and False when every attempt timed out or the status byte is non-zero.
The connection attempt happens before the try block, so a failure to
connect propagates to the caller.
"""
# NOTE(review): this span comes from a diff view, so the pre-change and
# post-change statements of the commit both appear below (two client
# constructions, two _request calls, two except lines). Only one of each
# pair exists in the real file.
# Start from an empty receive buffer and an empty response queue.
self._reset()
command = self._build_mos_command(value)
_LOGGER.debug("Writing MOS value 0x%02X to BMS at %s", value, self._address)
client = BleakClient(ble_device)
client = await establish_connection(
BleakClientWithServiceCache,
ble_device,
self._address,
max_attempts=3,
)
try:
await client.connect()
# Subscribe first so _on_notify can collect the reply to the write.
await client.start_notify(RX_CHAR_UUID, self._on_notify)
# Short pause after subscribing, as in poll(), to give the BMS a moment
# to register the subscription before the command is sent.
await asyncio.sleep(0.5)
response = await self._request(client, command, timeout=3.0, retries=2)
# Response: DD E1 00 00 CHK_H CHK_L 77 (status byte 0x00 = OK)
await asyncio.sleep(0.3)
response = await self._request(client, command, timeout=5.0, retries=2)
# frame[2] is the STATUS byte of the frame layout; None means no reply.
return response is not None and response[2] == 0x00
finally:
# Best-effort disconnect so the BMS's connection slot is released.
try:
await client.disconnect()
except Exception:
except BleakError:
pass
self._buffer.clear()
@staticmethod
def _build_mos_command(value: int) -> bytes:
@@ -198,7 +223,7 @@ class BmsBluetoothHandler:
Checksum = two's complement of sum, high byte first.
Verified against spec example:
XX=0x02 → sum=0xE5 → ~0xE5+1=0xFF1B → CHK FF 1B
XX=0x02 -> sum=0xE5 -> ~0xE5+1=0xFF1B -> CHK FF 1B
"""
checked = [0xE1, 0x02, 0x00, value & 0xFF]
checksum = (~sum(checked) + 1) & 0xFFFF
@@ -217,10 +242,10 @@ class BmsBluetoothHandler:
"""Parse a 0x03 general info response frame.
Payload byte offsets (frame[4] is payload[0]):
0-1 Total voltage uint16 BE ÷100 → V
2-3 Current int16 BE ÷100 → A (positive = charging, negative = discharging)
4-5 Residual capacity uint16 BE ÷100 → Ah
6-7 Nominal capacity uint16 BE ÷100 → Ah
0-1 Total voltage uint16 BE /100 -> V
2-3 Current int16 BE /100 -> A (positive = charging, negative = discharging)
4-5 Residual capacity uint16 BE /100 -> Ah
6-7 Nominal capacity uint16 BE /100 -> Ah
8-9 Cycle count uint16 BE
10-11 Production date (ignored)
12-15 Balance status (ignored)
@@ -230,7 +255,7 @@ class BmsBluetoothHandler:
20 MOS status uint8
21 Cell count uint8
22 Temp probe count uint8
23+ Temperatures uint16 BE each (raw − 2731) ÷ 10 → °C
23+ Temperatures uint16 BE each (raw - 2731) / 10 -> C
"""
p = frame[_HEADER_LEN:-_TRAILER_LEN]
@@ -284,11 +309,11 @@ class BmsBluetoothHandler:
def parse_cell_info(frame: bytes) -> dict:
"""Parse a 0x04 cell voltage response frame.
Per spec: frame[3] (the header length byte) = cell_count × 2.
Per spec: frame[3] (the header length byte) = cell_count x 2.
The payload contains ONLY the voltage bytes — no count byte.
0+ Cell voltages uint16 BE each unit mV ÷1000 → V
0+ Cell voltages uint16 BE each unit mV /1000 -> V
"""
count = frame[3] // 2 # header length byte = N_cells × 2
count = frame[3] // 2 # header length byte = N_cells x 2
p = frame[_HEADER_LEN:-_TRAILER_LEN]
voltages: list[float] = []
for i in range(count):