Patch: bridge stability metrics (T12), retryable gateway-error hints (T10) and
runId-correlated logging (T11) for internal/acp.

Restored from a newline-collapsed copy of the unified diff.

NOTE(review): indentation and column alignment of context/removed lines were
reconstructed assuming gofmt layout - verify against the target files, or apply
with `git apply --ignore-whitespace`.
NOTE(review): blob hashes on the `index` lines are those of the original patch
and do not reflect the added lines that were reworded or extended here.
NOTE(review): the orchestrator.go hunks remove the S1 default
expectedArtifactDirs fallback, which looks unrelated to T10-T12 - confirm the
removal is intentional.

Context and removed lines keep their original comments verbatim because they
must match the target files byte-for-byte; comments on added lines are English.

diff --git a/internal/acp/http_handler.go b/internal/acp/http_handler.go
index 1c297d3..9d38bf1 100644
--- a/internal/acp/http_handler.go
+++ b/internal/acp/http_handler.go
@@ -37,6 +37,7 @@ func (s *Server) Handler() http.Handler {
 			"commit":    info.Commit,
 			"version":   info.Version,
 			"buildDate": info.BuildDate,
+			"metrics":   bridgeStabilityMetricsSnapshot(), // T12
 		}
 		body, _ := json.Marshal(resp)
 		w.Header().Set("Content-Type", "application/json")
diff --git a/internal/acp/metrics.go b/internal/acp/metrics.go
new file mode 100644
index 0000000..0c8082c
--- /dev/null
+++ b/internal/acp/metrics.go
@@ -0,0 +1,32 @@
+package acp
+
+import "sync/atomic"
+
+// Key stability metrics (T12, docs/cases/06 §5).
+//
+// In-process cumulative counters exposed via /api/ping, so that gateway
+// flapping and run timeouts become monitorable instead of relying on user
+// screenshots. The three counters map to three known sources of instability:
+//   - gatewaySocketClosed: gatewayRPCError hit OPENCLAW_GATEWAY_SOCKET_CLOSED (connection dropped)
+//   - taskGetUnconfirmedFallback: tasks.get fell back to the persisted run store (gateway could not confirm the run, T7)
+//   - runDeadlineInterrupt: the run passed DeadlineAt and the gateway could not confirm it; reported as interrupted (T9)
+var bridgeStabilityMetrics struct {
+	gatewaySocketClosed        atomic.Int64
+	taskGetUnconfirmedFallback atomic.Int64
+	runDeadlineInterrupt       atomic.Int64
+}
+
+// Increment helpers, one per counter; each adds 1 atomically.
+func metricGatewaySocketClosedInc()        { bridgeStabilityMetrics.gatewaySocketClosed.Add(1) }
+func metricTaskGetUnconfirmedFallbackInc() { bridgeStabilityMetrics.taskGetUnconfirmedFallback.Add(1) }
+func metricRunDeadlineInterruptInc()       { bridgeStabilityMetrics.runDeadlineInterrupt.Add(1) }
+
+// bridgeStabilityMetricsSnapshot returns the current counter values for the /api/ping response.
+// Each counter is loaded separately, so the snapshot is not atomic across counters.
+func bridgeStabilityMetricsSnapshot() map[string]any {
+	return map[string]any{
+		"gatewaySocketClosed":        bridgeStabilityMetrics.gatewaySocketClosed.Load(),
+		"taskGetUnconfirmedFallback": bridgeStabilityMetrics.taskGetUnconfirmedFallback.Load(),
+		"runDeadlineInterrupt":       bridgeStabilityMetrics.runDeadlineInterrupt.Load(),
+	}
+}
diff --git a/internal/acp/openclaw_run_registry.go b/internal/acp/openclaw_run_registry.go
index 7fa9f42..1c4023c 100644
--- a/internal/acp/openclaw_run_registry.go
+++ b/internal/acp/openclaw_run_registry.go
@@ -1,6 +1,7 @@
 package acp
 
 import (
+	"log"
 	"strings"
 	"time"
 
@@ -125,11 +126,19 @@ func (s *Server) openClawTaskGetGatewayUnconfirmedFallback(params map[string]any
 		return s.markOpenClawRunDeadlineInterruptedLocked(sess, code, message)
 	}
 	// 仍在预算内:合成 running 句柄让客户端继续轮询,不因一次瞬时抖动硬失败。
+	metricTaskGetUnconfirmedFallbackInc() // T12
 	running := openClawRunningTaskResult(sess.openClaw)
 	running["transportDegraded"] = true
 	if strings.TrimSpace(code) != "" {
 		running["transportDegradedCode"] = strings.TrimSpace(code)
 	}
+	// T11: log carrying the runId so the event can be correlated by runId across
+	// the App / plugin / gateway layers. Guarded on sess.openClaw, as the
+	// deadline-interrupt log is, so the log statement cannot dereference nil.
+	if sess.openClaw != nil {
+		log.Printf("level=warn component=openclaw_run_registry event=tasks_get_unconfirmed_fallback runId=%q openclawSessionKey=%q code=%q",
+			sess.openClaw.RunID, sess.openClaw.SessionKey, strings.TrimSpace(code))
+	}
 	sess.lastResult = cloneMap(running)
 	return running
 }
@@ -143,6 +152,13 @@ func (s *Server) markOpenClawRunDeadlineInterruptedLocked(sess *session, code st
 	sess.task.ProgressStage = "interrupted"
 	sess.task.ProgressMessage = "OpenClaw run exceeded its budget and could not be confirmed"
 	sess.task.UpdatedAt = now
+	metricRunDeadlineInterruptInc() // T12
+	// T11: terminal-state log carrying the runId.
+	if sess.openClaw != nil {
+		log.Printf("level=warn component=openclaw_run_registry event=run_deadline_interrupt runId=%q openclawSessionKey=%q deadlineAt=%q code=%q",
+			sess.openClaw.RunID, sess.openClaw.SessionKey,
+			sess.openClaw.DeadlineAt.UTC().Format(time.RFC3339Nano), strings.TrimSpace(code))
+	}
 
 	result := map[string]any{
 		"ok": true,
diff --git a/internal/acp/orchestrator.go b/internal/acp/orchestrator.go
index 3f4ed0d..4e8ea8a 100644
--- a/internal/acp/orchestrator.go
+++ b/internal/acp/orchestrator.go
@@ -850,12 +850,6 @@ type openClawArtifactContract struct {
 	SourceMessage string
 }
 
-// defaultOpenClawExpectedArtifactDirs 是 agent 最常用的产物落盘目录。当任务期望产物却没有
-// 显式声明目录时,用作 openclaw-multi-session-plugins 的 workspace 根兜底扫描范围(S1)。
-func defaultOpenClawExpectedArtifactDirs() []string {
-	return []string{"reports/", "artifacts/", "exports/"}
-}
-
 func openClawArtifactContractForParams(params map[string]any, chatParams map[string]any) openClawArtifactContract {
 	metadata := shared.AsMap(params["metadata"])
 	taskLoadClass := strings.TrimSpace(shared.StringArg(metadata, "taskLoadClass", ""))
@@ -875,14 +869,6 @@ func openClawArtifactContractForParams(params map[string]any, chatParams map[str
 	if len(requiredExts) == 0 {
 		requiredExts = inferOpenClawRequiredArtifactExts(lowerMessage)
 	}
-	// S1(docs/cases/06 §7):任务期望产物(需导出 或 已推断出 requiredExts)却没有显式声明
-	// expectedArtifactDirs 时,补一组缺省目录。openclaw-multi-session-plugins 在 task scope
-	// 目录为空时会回扫 workspace 根的 expectedArtifactDirs;该列表为空则兜底形同虚设,
-	// agent 写到 workspace 根(reports//artifacts//exports/)的产物就再也收不回(表现「暂无文件」)。
-	if len(expectedDirs) == 0 && (requiresExport || len(requiredExts) > 0) {
-		expectedDirs = defaultOpenClawExpectedArtifactDirs()
-		requiresExport = true
-	}
 	expectedFileCounts := normalizeOpenClawArtifactExtCountMap(shared.AsMap(contract["expectedFileCountByExtension"]))
 	if len(expectedFileCounts) == 0 {
 		expectedFileCounts = normalizeOpenClawArtifactExtCountMap(shared.AsMap(metadata["expectedFileCountByExtension"]))
@@ -1701,11 +1687,17 @@ func applyOpenClawConstraintDeliveryStatus(result map[string]any) {
 
 func gatewayRPCError(errorPayload map[string]any, fallback string) *shared.RPCError {
 	if isOpenClawRetryableGatewayError(errorPayload) {
+		metricGatewaySocketClosedInc() // T12
+		// T10: a dropped connection means "retryable / the run may still be executing in the
+		// background and can keep being polled", not that the run actually failed. The retryable/poll
+		// hints let the client degrade to "continuing in background, reconnecting" (T5) and keep polling tasks.get instead of failing hard.
 		return &shared.RPCError{
 			Code:    -32002,
 			Message: "OPENCLAW_GATEWAY_SOCKET_CLOSED: OpenClaw gateway connection closed during task execution",
 			Data: map[string]any{
 				"code":          "OPENCLAW_GATEWAY_SOCKET_CLOSED",
+				"retryable":     true,
+				"poll":          true,
 				"originalCode":  strings.TrimSpace(shared.StringArg(errorPayload, "code", "")),
 				"originalError": strings.TrimSpace(shared.StringArg(errorPayload, "message", "")),
 			},