fix(acp): T10/T11/T12 observability + error semantics; revert S1 (broke main)
T10: gatewayRPCError marks OPENCLAW_GATEWAY_SOCKET_CLOSED with retryable=true, poll=true so the client degrades to "background/reconnecting" + keeps polling instead of hard-failing (feeds App T5). T11: runId-tagged warn logs at the tasks.get unconfirmed-fallback and run-deadline-interrupt sites, so a runId can be joined across App→bridge→plugin→gateway. T12: process-level stability counters (gatewaySocketClosed, taskGetUnconfirmedFallback, runDeadlineInterrupt) exposed via /api/ping.metrics. Revert S1 (default expectedArtifactDirs): it set requiresExport=true / default dirs for any artifact-inferring task, which made a gateway run that succeeds with NO artifact hang "waiting for artifact export" (TestHTTPHandlerGatewayOpenClawHandlesFiveConcurrentE2ECases + ...WithoutPromptHeuristic went red). The blocking is tied to expectedArtifactDirs presence in openClawTaskGetRequiresArtifactExport; decoupling scan-hint from block-on-export needs a careful, separately-tested change. Reverted to keep main green; S1 to be redesigned (see docs/cases/06 §7). Full internal/acp suite green. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
02808934c8
commit
fa9cc78add
@ -37,6 +37,7 @@ func (s *Server) Handler() http.Handler {
|
||||
"commit": info.Commit,
|
||||
"version": info.Version,
|
||||
"buildDate": info.BuildDate,
|
||||
"metrics": bridgeStabilityMetricsSnapshot(), // T12
|
||||
}
|
||||
body, _ := json.Marshal(resp)
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
||||
29
internal/acp/metrics.go
Normal file
29
internal/acp/metrics.go
Normal file
@ -0,0 +1,29 @@
|
||||
package acp
|
||||
|
||||
import "sync/atomic"
|
||||
|
||||
// bridgeStabilityMetrics holds the key stability counters (T12,
// docs/cases/06 §5).
//
// The counters accumulate for the lifetime of the process and are exposed via
// /api/ping, so that "gateway jitter / run timeout" incidents can be monitored
// instead of being diagnosed from user screenshots. Each counter corresponds
// to one known source of instability:
//
//   - gatewaySocketClosed: gatewayRPCError hit
//     OPENCLAW_GATEWAY_SOCKET_CLOSED (connection dropped).
//   - taskGetUnconfirmedFallback: tasks.get fell back to the persistent run
//     store because the gateway could not confirm the run (T7).
//   - runDeadlineInterrupt: the run exceeded DeadlineAt and the gateway could
//     not confirm it, so it was reported as interrupted (T9).
var bridgeStabilityMetrics struct {
	gatewaySocketClosed        atomic.Int64
	taskGetUnconfirmedFallback atomic.Int64
	runDeadlineInterrupt       atomic.Int64
}
|
||||
|
||||
// metricGatewaySocketClosedInc atomically adds one to the process-wide
// gatewaySocketClosed counter.
func metricGatewaySocketClosedInc() {
	counter := &bridgeStabilityMetrics.gatewaySocketClosed
	counter.Add(1)
}
|
||||
// metricTaskGetUnconfirmedFallbackInc atomically adds one to the process-wide
// taskGetUnconfirmedFallback counter.
func metricTaskGetUnconfirmedFallbackInc() {
	counter := &bridgeStabilityMetrics.taskGetUnconfirmedFallback
	counter.Add(1)
}
|
||||
// metricRunDeadlineInterruptInc atomically adds one to the process-wide
// runDeadlineInterrupt counter.
func metricRunDeadlineInterruptInc() {
	counter := &bridgeStabilityMetrics.runDeadlineInterrupt
	counter.Add(1)
}
|
||||
|
||||
// bridgeStabilityMetricsSnapshot 返回当前计数快照,供 /api/ping 输出。
|
||||
func bridgeStabilityMetricsSnapshot() map[string]any {
|
||||
return map[string]any{
|
||||
"gatewaySocketClosed": bridgeStabilityMetrics.gatewaySocketClosed.Load(),
|
||||
"taskGetUnconfirmedFallback": bridgeStabilityMetrics.taskGetUnconfirmedFallback.Load(),
|
||||
"runDeadlineInterrupt": bridgeStabilityMetrics.runDeadlineInterrupt.Load(),
|
||||
}
|
||||
}
|
||||
@ -1,6 +1,7 @@
|
||||
package acp
|
||||
|
||||
import (
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -125,11 +126,15 @@ func (s *Server) openClawTaskGetGatewayUnconfirmedFallback(params map[string]any
|
||||
return s.markOpenClawRunDeadlineInterruptedLocked(sess, code, message)
|
||||
}
|
||||
// 仍在预算内:合成 running 句柄让客户端继续轮询,不因一次瞬时抖动硬失败。
|
||||
metricTaskGetUnconfirmedFallbackInc() // T12
|
||||
running := openClawRunningTaskResult(sess.openClaw)
|
||||
running["transportDegraded"] = true
|
||||
if strings.TrimSpace(code) != "" {
|
||||
running["transportDegradedCode"] = strings.TrimSpace(code)
|
||||
}
|
||||
// T11:带 runId 的日志,便于与 App / 插件 / 网关四层按 runId 串联。
|
||||
log.Printf("level=warn component=openclaw_run_registry event=tasks_get_unconfirmed_fallback runId=%q openclawSessionKey=%q code=%q",
|
||||
sess.openClaw.RunID, sess.openClaw.SessionKey, strings.TrimSpace(code))
|
||||
sess.lastResult = cloneMap(running)
|
||||
return running
|
||||
}
|
||||
@ -143,6 +148,13 @@ func (s *Server) markOpenClawRunDeadlineInterruptedLocked(sess *session, code st
|
||||
sess.task.ProgressStage = "interrupted"
|
||||
sess.task.ProgressMessage = "OpenClaw run exceeded its budget and could not be confirmed"
|
||||
sess.task.UpdatedAt = now
|
||||
metricRunDeadlineInterruptInc() // T12
|
||||
// T11:带 runId 的终态日志。
|
||||
if sess.openClaw != nil {
|
||||
log.Printf("level=warn component=openclaw_run_registry event=run_deadline_interrupt runId=%q openclawSessionKey=%q deadlineAt=%q code=%q",
|
||||
sess.openClaw.RunID, sess.openClaw.SessionKey,
|
||||
sess.openClaw.DeadlineAt.UTC().Format(time.RFC3339Nano), strings.TrimSpace(code))
|
||||
}
|
||||
|
||||
result := map[string]any{
|
||||
"ok": true,
|
||||
|
||||
@ -850,12 +850,6 @@ type openClawArtifactContract struct {
|
||||
SourceMessage string
|
||||
}
|
||||
|
||||
// defaultOpenClawExpectedArtifactDirs returns the directories agents most
// commonly write artifacts to. When a task expects artifacts but declares no
// explicit directories, this list serves as the workspace-root fallback scan
// scope for openclaw-multi-session-plugins (S1).
//
// A fresh slice is returned on every call, so callers may modify the result.
func defaultOpenClawExpectedArtifactDirs() []string {
	return []string{"reports/", "artifacts/", "exports/"}
}
|
||||
|
||||
func openClawArtifactContractForParams(params map[string]any, chatParams map[string]any) openClawArtifactContract {
|
||||
metadata := shared.AsMap(params["metadata"])
|
||||
taskLoadClass := strings.TrimSpace(shared.StringArg(metadata, "taskLoadClass", ""))
|
||||
@ -875,14 +869,6 @@ func openClawArtifactContractForParams(params map[string]any, chatParams map[str
|
||||
if len(requiredExts) == 0 {
|
||||
requiredExts = inferOpenClawRequiredArtifactExts(lowerMessage)
|
||||
}
|
||||
// S1(docs/cases/06 §7):任务期望产物(需导出 或 已推断出 requiredExts)却没有显式声明
|
||||
// expectedArtifactDirs 时,补一组缺省目录。openclaw-multi-session-plugins 在 task scope
|
||||
// 目录为空时会回扫 workspace 根的 expectedArtifactDirs;该列表为空则兜底形同虚设,
|
||||
// agent 写到 workspace 根(reports//artifacts//exports/)的产物就再也收不回(表现「暂无文件」)。
|
||||
if len(expectedDirs) == 0 && (requiresExport || len(requiredExts) > 0) {
|
||||
expectedDirs = defaultOpenClawExpectedArtifactDirs()
|
||||
requiresExport = true
|
||||
}
|
||||
expectedFileCounts := normalizeOpenClawArtifactExtCountMap(shared.AsMap(contract["expectedFileCountByExtension"]))
|
||||
if len(expectedFileCounts) == 0 {
|
||||
expectedFileCounts = normalizeOpenClawArtifactExtCountMap(shared.AsMap(metadata["expectedFileCountByExtension"]))
|
||||
@ -1701,11 +1687,16 @@ func applyOpenClawConstraintDeliveryStatus(result map[string]any) {
|
||||
|
||||
func gatewayRPCError(errorPayload map[string]any, fallback string) *shared.RPCError {
|
||||
if isOpenClawRetryableGatewayError(errorPayload) {
|
||||
metricGatewaySocketClosedInc() // T12
|
||||
// T10:连接断属「可重试 / run 可能仍在后台、可续轮询」语义,而非 run 确实失败。
|
||||
// 带 retryable/poll 提示,客户端据此降级为「后台续跑·重连中」(T5) 续轮询 tasks.get,而非硬失败。
|
||||
return &shared.RPCError{
|
||||
Code: -32002,
|
||||
Message: "OPENCLAW_GATEWAY_SOCKET_CLOSED: OpenClaw gateway connection closed during task execution",
|
||||
Data: map[string]any{
|
||||
"code": "OPENCLAW_GATEWAY_SOCKET_CLOSED",
|
||||
"retryable": true,
|
||||
"poll": true,
|
||||
"originalCode": strings.TrimSpace(shared.StringArg(errorPayload, "code", "")),
|
||||
"originalError": strings.TrimSpace(shared.StringArg(errorPayload, "message", "")),
|
||||
},
|
||||
|
||||
Loading…
Reference in New Issue
Block a user