fix(acp): T10/T11/T12 observability + error semantics; revert S1 (broke main)
T10: gatewayRPCError marks OPENCLAW_GATEWAY_SOCKET_CLOSED with retryable=true, poll=true so the client degrades to "background/reconnecting" + keeps polling instead of hard-failing (feeds App T5).

T11: runId-tagged warn logs at the tasks.get unconfirmed-fallback and run-deadline-interrupt sites, so a runId can be joined across App→bridge→plugin→gateway.

T12: process-level stability counters (gatewaySocketClosed, taskGetUnconfirmedFallback, runDeadlineInterrupt) exposed via /api/ping.metrics.

Revert S1 (default expectedArtifactDirs): it set requiresExport=true / default dirs for any artifact-inferring task, which made a gateway run that succeeds with NO artifact hang "waiting for artifact export" (TestHTTPHandlerGatewayOpenClawHandlesFiveConcurrentE2ECases + ...WithoutPromptHeuristic went red). The blocking is tied to expectedArtifactDirs presence in openClawTaskGetRequiresArtifactExport; decoupling scan-hint from block-on-export needs a careful, separately-tested change. Reverted to keep main green; S1 to be redesigned (see docs/cases/06 §7).

Full internal/acp suite green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
02808934c8
commit
fa9cc78add
@ -37,6 +37,7 @@ func (s *Server) Handler() http.Handler {
|
|||||||
"commit": info.Commit,
|
"commit": info.Commit,
|
||||||
"version": info.Version,
|
"version": info.Version,
|
||||||
"buildDate": info.BuildDate,
|
"buildDate": info.BuildDate,
|
||||||
|
"metrics": bridgeStabilityMetricsSnapshot(), // T12
|
||||||
}
|
}
|
||||||
body, _ := json.Marshal(resp)
|
body, _ := json.Marshal(resp)
|
||||||
w.Header().Set("Content-Type", "application/json")
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
|||||||
29
internal/acp/metrics.go
Normal file
29
internal/acp/metrics.go
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
package acp
|
||||||
|
|
||||||
|
import "sync/atomic"
|
||||||
|
|
||||||
|
// Key stability metrics (T12, docs/cases/06 §5).
//
// Process-wide cumulative counters, exposed through /api/ping, so that
// "gateway jitter / run timeout" incidents become monitorable instead of being
// diagnosed from user screenshots. The three counters correspond to three
// known sources of instability:
//   - gatewaySocketClosed: gatewayRPCError hit OPENCLAW_GATEWAY_SOCKET_CLOSED (connection dropped)
//   - taskGetUnconfirmedFallback: tasks.get fell back to the persisted run store (gateway could not confirm the run, T7)
//   - runDeadlineInterrupt: the run passed DeadlineAt and the gateway could not confirm it, so "interrupted" was returned (T9)
var bridgeStabilityMetrics struct {
	gatewaySocketClosed,
	taskGetUnconfirmedFallback,
	runDeadlineInterrupt atomic.Int64
}
|
||||||
|
|
||||||
|
// metricGatewaySocketClosedInc adds one to the process-wide
// gatewaySocketClosed counter.
func metricGatewaySocketClosedInc() {
	counter := &bridgeStabilityMetrics.gatewaySocketClosed
	counter.Add(1)
}
|
||||||
|
// metricTaskGetUnconfirmedFallbackInc adds one to the process-wide
// taskGetUnconfirmedFallback counter.
func metricTaskGetUnconfirmedFallbackInc() {
	counter := &bridgeStabilityMetrics.taskGetUnconfirmedFallback
	counter.Add(1)
}
|
||||||
|
// metricRunDeadlineInterruptInc adds one to the process-wide
// runDeadlineInterrupt counter.
func metricRunDeadlineInterruptInc() {
	counter := &bridgeStabilityMetrics.runDeadlineInterrupt
	counter.Add(1)
}
|
||||||
|
|
||||||
|
// bridgeStabilityMetricsSnapshot 返回当前计数快照,供 /api/ping 输出。
|
||||||
|
func bridgeStabilityMetricsSnapshot() map[string]any {
|
||||||
|
return map[string]any{
|
||||||
|
"gatewaySocketClosed": bridgeStabilityMetrics.gatewaySocketClosed.Load(),
|
||||||
|
"taskGetUnconfirmedFallback": bridgeStabilityMetrics.taskGetUnconfirmedFallback.Load(),
|
||||||
|
"runDeadlineInterrupt": bridgeStabilityMetrics.runDeadlineInterrupt.Load(),
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -1,6 +1,7 @@
|
|||||||
package acp
|
package acp
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"log"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -125,11 +126,15 @@ func (s *Server) openClawTaskGetGatewayUnconfirmedFallback(params map[string]any
|
|||||||
return s.markOpenClawRunDeadlineInterruptedLocked(sess, code, message)
|
return s.markOpenClawRunDeadlineInterruptedLocked(sess, code, message)
|
||||||
}
|
}
|
||||||
// 仍在预算内:合成 running 句柄让客户端继续轮询,不因一次瞬时抖动硬失败。
|
// 仍在预算内:合成 running 句柄让客户端继续轮询,不因一次瞬时抖动硬失败。
|
||||||
|
metricTaskGetUnconfirmedFallbackInc() // T12
|
||||||
running := openClawRunningTaskResult(sess.openClaw)
|
running := openClawRunningTaskResult(sess.openClaw)
|
||||||
running["transportDegraded"] = true
|
running["transportDegraded"] = true
|
||||||
if strings.TrimSpace(code) != "" {
|
if strings.TrimSpace(code) != "" {
|
||||||
running["transportDegradedCode"] = strings.TrimSpace(code)
|
running["transportDegradedCode"] = strings.TrimSpace(code)
|
||||||
}
|
}
|
||||||
|
// T11:带 runId 的日志,便于与 App / 插件 / 网关四层按 runId 串联。
|
||||||
|
log.Printf("level=warn component=openclaw_run_registry event=tasks_get_unconfirmed_fallback runId=%q openclawSessionKey=%q code=%q",
|
||||||
|
sess.openClaw.RunID, sess.openClaw.SessionKey, strings.TrimSpace(code))
|
||||||
sess.lastResult = cloneMap(running)
|
sess.lastResult = cloneMap(running)
|
||||||
return running
|
return running
|
||||||
}
|
}
|
||||||
@ -143,6 +148,13 @@ func (s *Server) markOpenClawRunDeadlineInterruptedLocked(sess *session, code st
|
|||||||
sess.task.ProgressStage = "interrupted"
|
sess.task.ProgressStage = "interrupted"
|
||||||
sess.task.ProgressMessage = "OpenClaw run exceeded its budget and could not be confirmed"
|
sess.task.ProgressMessage = "OpenClaw run exceeded its budget and could not be confirmed"
|
||||||
sess.task.UpdatedAt = now
|
sess.task.UpdatedAt = now
|
||||||
|
metricRunDeadlineInterruptInc() // T12
|
||||||
|
// T11:带 runId 的终态日志。
|
||||||
|
if sess.openClaw != nil {
|
||||||
|
log.Printf("level=warn component=openclaw_run_registry event=run_deadline_interrupt runId=%q openclawSessionKey=%q deadlineAt=%q code=%q",
|
||||||
|
sess.openClaw.RunID, sess.openClaw.SessionKey,
|
||||||
|
sess.openClaw.DeadlineAt.UTC().Format(time.RFC3339Nano), strings.TrimSpace(code))
|
||||||
|
}
|
||||||
|
|
||||||
result := map[string]any{
|
result := map[string]any{
|
||||||
"ok": true,
|
"ok": true,
|
||||||
|
|||||||
@ -850,12 +850,6 @@ type openClawArtifactContract struct {
|
|||||||
SourceMessage string
|
SourceMessage string
|
||||||
}
|
}
|
||||||
|
|
||||||
// defaultOpenClawExpectedArtifactDirs 是 agent 最常用的产物落盘目录。当任务期望产物却没有
|
|
||||||
// 显式声明目录时,用作 openclaw-multi-session-plugins 的 workspace 根兜底扫描范围(S1)。
|
|
||||||
func defaultOpenClawExpectedArtifactDirs() []string {
|
|
||||||
return []string{"reports/", "artifacts/", "exports/"}
|
|
||||||
}
|
|
||||||
|
|
||||||
func openClawArtifactContractForParams(params map[string]any, chatParams map[string]any) openClawArtifactContract {
|
func openClawArtifactContractForParams(params map[string]any, chatParams map[string]any) openClawArtifactContract {
|
||||||
metadata := shared.AsMap(params["metadata"])
|
metadata := shared.AsMap(params["metadata"])
|
||||||
taskLoadClass := strings.TrimSpace(shared.StringArg(metadata, "taskLoadClass", ""))
|
taskLoadClass := strings.TrimSpace(shared.StringArg(metadata, "taskLoadClass", ""))
|
||||||
@ -875,14 +869,6 @@ func openClawArtifactContractForParams(params map[string]any, chatParams map[str
|
|||||||
if len(requiredExts) == 0 {
|
if len(requiredExts) == 0 {
|
||||||
requiredExts = inferOpenClawRequiredArtifactExts(lowerMessage)
|
requiredExts = inferOpenClawRequiredArtifactExts(lowerMessage)
|
||||||
}
|
}
|
||||||
// S1(docs/cases/06 §7):任务期望产物(需导出 或 已推断出 requiredExts)却没有显式声明
|
|
||||||
// expectedArtifactDirs 时,补一组缺省目录。openclaw-multi-session-plugins 在 task scope
|
|
||||||
// 目录为空时会回扫 workspace 根的 expectedArtifactDirs;该列表为空则兜底形同虚设,
|
|
||||||
// agent 写到 workspace 根(reports//artifacts//exports/)的产物就再也收不回(表现「暂无文件」)。
|
|
||||||
if len(expectedDirs) == 0 && (requiresExport || len(requiredExts) > 0) {
|
|
||||||
expectedDirs = defaultOpenClawExpectedArtifactDirs()
|
|
||||||
requiresExport = true
|
|
||||||
}
|
|
||||||
expectedFileCounts := normalizeOpenClawArtifactExtCountMap(shared.AsMap(contract["expectedFileCountByExtension"]))
|
expectedFileCounts := normalizeOpenClawArtifactExtCountMap(shared.AsMap(contract["expectedFileCountByExtension"]))
|
||||||
if len(expectedFileCounts) == 0 {
|
if len(expectedFileCounts) == 0 {
|
||||||
expectedFileCounts = normalizeOpenClawArtifactExtCountMap(shared.AsMap(metadata["expectedFileCountByExtension"]))
|
expectedFileCounts = normalizeOpenClawArtifactExtCountMap(shared.AsMap(metadata["expectedFileCountByExtension"]))
|
||||||
@ -1701,11 +1687,16 @@ func applyOpenClawConstraintDeliveryStatus(result map[string]any) {
|
|||||||
|
|
||||||
func gatewayRPCError(errorPayload map[string]any, fallback string) *shared.RPCError {
|
func gatewayRPCError(errorPayload map[string]any, fallback string) *shared.RPCError {
|
||||||
if isOpenClawRetryableGatewayError(errorPayload) {
|
if isOpenClawRetryableGatewayError(errorPayload) {
|
||||||
|
metricGatewaySocketClosedInc() // T12
|
||||||
|
// T10:连接断属「可重试 / run 可能仍在后台、可续轮询」语义,而非 run 确实失败。
|
||||||
|
// 带 retryable/poll 提示,客户端据此降级为「后台续跑·重连中」(T5) 续轮询 tasks.get,而非硬失败。
|
||||||
return &shared.RPCError{
|
return &shared.RPCError{
|
||||||
Code: -32002,
|
Code: -32002,
|
||||||
Message: "OPENCLAW_GATEWAY_SOCKET_CLOSED: OpenClaw gateway connection closed during task execution",
|
Message: "OPENCLAW_GATEWAY_SOCKET_CLOSED: OpenClaw gateway connection closed during task execution",
|
||||||
Data: map[string]any{
|
Data: map[string]any{
|
||||||
"code": "OPENCLAW_GATEWAY_SOCKET_CLOSED",
|
"code": "OPENCLAW_GATEWAY_SOCKET_CLOSED",
|
||||||
|
"retryable": true,
|
||||||
|
"poll": true,
|
||||||
"originalCode": strings.TrimSpace(shared.StringArg(errorPayload, "code", "")),
|
"originalCode": strings.TrimSpace(shared.StringArg(errorPayload, "code", "")),
|
||||||
"originalError": strings.TrimSpace(shared.StringArg(errorPayload, "message", "")),
|
"originalError": strings.TrimSpace(shared.StringArg(errorPayload, "message", "")),
|
||||||
},
|
},
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user