fix(acp): T10/T11/T12 observability + error semantics; revert S1 (broke main)

T10: gatewayRPCError marks OPENCLAW_GATEWAY_SOCKET_CLOSED with retryable=true,
poll=true so the client degrades to "background/reconnecting" + keeps polling
instead of hard-failing (feeds App T5).
T11: runId-tagged warn logs at the tasks.get unconfirmed-fallback and
run-deadline-interrupt sites, so a runId can be joined across App→bridge→plugin→gateway.
T12: process-level stability counters (gatewaySocketClosed, taskGetUnconfirmedFallback,
runDeadlineInterrupt) exposed via /api/ping.metrics.

Revert S1 (default expectedArtifactDirs): it set requiresExport=true / default dirs
for any artifact-inferring task, which made a gateway run that succeeds with NO
artifact hang "waiting for artifact export"
(TestHTTPHandlerGatewayOpenClawHandlesFiveConcurrentE2ECases +
...WithoutPromptHeuristic went red). The blocking is tied to
expectedArtifactDirs presence in openClawTaskGetRequiresArtifactExport; decoupling
scan-hint from block-on-export needs a careful, separately-tested change. Reverted to
keep main green; S1 to be redesigned (see docs/cases/06 §7).

Full internal/acp suite green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Haitao Pan 2026-06-27 06:43:21 +08:00
parent 02808934c8
commit fa9cc78add
4 changed files with 47 additions and 14 deletions

View File

@ -37,6 +37,7 @@ func (s *Server) Handler() http.Handler {
"commit": info.Commit,
"version": info.Version,
"buildDate": info.BuildDate,
"metrics": bridgeStabilityMetricsSnapshot(), // T12
}
body, _ := json.Marshal(resp)
w.Header().Set("Content-Type", "application/json")

29
internal/acp/metrics.go Normal file
View File

@ -0,0 +1,29 @@
package acp
import "sync/atomic"
// Key stability metrics (T12, docs/cases/06 §5).
//
// Process-wide cumulative counters, exposed through /api/ping, so that
// "gateway flapping / run timeout" incidents become monitorable instead of
// depending on user screenshots. The three counters map to three known
// sources of instability:
//   - gatewaySocketClosed: gatewayRPCError hit OPENCLAW_GATEWAY_SOCKET_CLOSED
//     (connection dropped).
//   - taskGetUnconfirmedFallback: tasks.get fell back to the persisted run
//     store because the gateway could not confirm the run (T7).
//   - runDeadlineInterrupt: the run passed DeadlineAt and the gateway could
//     not confirm it, so "interrupted" was returned (T9).
//
// The fields are atomic.Int64 values, so they are usable from their zero
// value and are updated/read without an additional lock.
var bridgeStabilityMetrics struct {
	gatewaySocketClosed atomic.Int64
	taskGetUnconfirmedFallback atomic.Int64
	runDeadlineInterrupt atomic.Int64
}
// metricGatewaySocketClosedInc atomically adds one to the process-wide
// gatewaySocketClosed counter.
func metricGatewaySocketClosedInc() {
	bridgeStabilityMetrics.gatewaySocketClosed.Add(1)
}
// metricTaskGetUnconfirmedFallbackInc atomically adds one to the process-wide
// taskGetUnconfirmedFallback counter.
func metricTaskGetUnconfirmedFallbackInc() {
	bridgeStabilityMetrics.taskGetUnconfirmedFallback.Add(1)
}
// metricRunDeadlineInterruptInc atomically adds one to the process-wide
// runDeadlineInterrupt counter.
func metricRunDeadlineInterruptInc() {
	bridgeStabilityMetrics.runDeadlineInterrupt.Add(1)
}
// bridgeStabilityMetricsSnapshot returns the current value of every stability
// counter, keyed by counter name, for inclusion in the /api/ping output.
//
// Each counter is read with its own atomic Load; the three reads are not
// performed as a single atomic unit.
func bridgeStabilityMetricsSnapshot() map[string]any {
	counters := &bridgeStabilityMetrics

	snapshot := make(map[string]any, 3)
	snapshot["gatewaySocketClosed"] = counters.gatewaySocketClosed.Load()
	snapshot["taskGetUnconfirmedFallback"] = counters.taskGetUnconfirmedFallback.Load()
	snapshot["runDeadlineInterrupt"] = counters.runDeadlineInterrupt.Load()
	return snapshot
}

View File

@ -1,6 +1,7 @@
package acp
import (
"log"
"strings"
"time"
@ -125,11 +126,15 @@ func (s *Server) openClawTaskGetGatewayUnconfirmedFallback(params map[string]any
return s.markOpenClawRunDeadlineInterruptedLocked(sess, code, message)
}
// 仍在预算内:合成 running 句柄让客户端继续轮询,不因一次瞬时抖动硬失败。
metricTaskGetUnconfirmedFallbackInc() // T12
running := openClawRunningTaskResult(sess.openClaw)
running["transportDegraded"] = true
if strings.TrimSpace(code) != "" {
running["transportDegradedCode"] = strings.TrimSpace(code)
}
// T11: runId-tagged log, so this entry can be correlated by runId across all four layers (App, bridge, plugin, gateway).
log.Printf("level=warn component=openclaw_run_registry event=tasks_get_unconfirmed_fallback runId=%q openclawSessionKey=%q code=%q",
sess.openClaw.RunID, sess.openClaw.SessionKey, strings.TrimSpace(code))
sess.lastResult = cloneMap(running)
return running
}
@ -143,6 +148,13 @@ func (s *Server) markOpenClawRunDeadlineInterruptedLocked(sess *session, code st
sess.task.ProgressStage = "interrupted"
sess.task.ProgressMessage = "OpenClaw run exceeded its budget and could not be confirmed"
sess.task.UpdatedAt = now
metricRunDeadlineInterruptInc() // T12
// T11: runId-tagged log for the terminal state.
if sess.openClaw != nil {
log.Printf("level=warn component=openclaw_run_registry event=run_deadline_interrupt runId=%q openclawSessionKey=%q deadlineAt=%q code=%q",
sess.openClaw.RunID, sess.openClaw.SessionKey,
sess.openClaw.DeadlineAt.UTC().Format(time.RFC3339Nano), strings.TrimSpace(code))
}
result := map[string]any{
"ok": true,

View File

@ -850,12 +850,6 @@ type openClawArtifactContract struct {
SourceMessage string
}
// defaultOpenClawExpectedArtifactDirs lists the directories an agent most
// commonly writes artifacts to. When a task expects artifacts but declares no
// directories explicitly, this list serves as the fallback scan scope under
// the workspace root for openclaw-multi-session-plugins (S1).
//
// A freshly allocated slice is returned on every call.
func defaultOpenClawExpectedArtifactDirs() []string {
	dirs := make([]string, 0, 3)
	dirs = append(dirs, "reports/", "artifacts/", "exports/")
	return dirs
}
func openClawArtifactContractForParams(params map[string]any, chatParams map[string]any) openClawArtifactContract {
metadata := shared.AsMap(params["metadata"])
taskLoadClass := strings.TrimSpace(shared.StringArg(metadata, "taskLoadClass", ""))
@ -875,14 +869,6 @@ func openClawArtifactContractForParams(params map[string]any, chatParams map[str
if len(requiredExts) == 0 {
requiredExts = inferOpenClawRequiredArtifactExts(lowerMessage)
}
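// S1 (docs/cases/06 §7): when the task expects artifacts (export is required, or
// requiredExts has already been inferred) but declares no explicit
// expectedArtifactDirs, fill in a set of default directories. When the task-scope
// directory is empty, openclaw-multi-session-plugins falls back to scanning
// expectedArtifactDirs under the workspace root; if that list is empty the fallback
// is useless, and artifacts the agent wrote under the workspace root
// (reports/, artifacts/, exports/) can never be collected (shown as "no files yet").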
if len(expectedDirs) == 0 && (requiresExport || len(requiredExts) > 0) {
expectedDirs = defaultOpenClawExpectedArtifactDirs()
requiresExport = true
}
expectedFileCounts := normalizeOpenClawArtifactExtCountMap(shared.AsMap(contract["expectedFileCountByExtension"]))
if len(expectedFileCounts) == 0 {
expectedFileCounts = normalizeOpenClawArtifactExtCountMap(shared.AsMap(metadata["expectedFileCountByExtension"]))
@ -1701,11 +1687,16 @@ func applyOpenClawConstraintDeliveryStatus(result map[string]any) {
func gatewayRPCError(errorPayload map[string]any, fallback string) *shared.RPCError {
if isOpenClawRetryableGatewayError(errorPayload) {
metricGatewaySocketClosedInc() // T12
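// T10: a dropped connection means "retryable / the run may still be going in the
// background and can keep being polled", not that the run actually failed.
// Carry retryable/poll hints so the client degrades to "continuing in background ·
// reconnecting" (T5) and keeps polling tasks.get instead of hard-failing.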
return &shared.RPCError{
Code: -32002,
Message: "OPENCLAW_GATEWAY_SOCKET_CLOSED: OpenClaw gateway connection closed during task execution",
Data: map[string]any{
"code": "OPENCLAW_GATEWAY_SOCKET_CLOSED",
"retryable": true,
"poll": true,
"originalCode": strings.TrimSpace(shared.StringArg(errorPayload, "code", "")),
"originalError": strings.TrimSpace(shared.StringArg(errorPayload, "message", "")),
},