Add deterministic IT infra video skill chain

This commit is contained in:
Haitao Pan 2026-05-25 12:39:58 +08:00
parent 03645d5bfd
commit dc3719fee2
6 changed files with 893 additions and 0 deletions

View File

@ -16,6 +16,7 @@
| AI 信息差快报 | 新闻检索、素材匹配、口播字幕、视频渲染 | `skills/ai-tech-news-video/SKILL.md` |
| IT 基础设施连续 PNG | 根据描述或参考图生成 1-N 张连续风格竖版 PNG 素材 | `skills/it-infra-continuous-png/SKILL.md` |
| IT 基础设施长图讲解视频 | 基于长图素材生成 HyperFrames 讲解视频、口播、字幕和 timeline | `skills/it-infra-evolution-video/SKILL.md` |
| IT 基础设施长图讲解视频 v2 | 从 PNG manifest 强制生成配置、HTML、音频、验收和 MP4 | `skills/it-infra-evolution-video-v2/SKILL.md` |
| 产品介绍视频 | 官网信息提炼、叙事结构、成片节奏 | `skills/product-intro-video/SKILL.md` |
| 视频音效工作流 | 音效搜索、下载与合成、时间线接入 | `skills/sound-fx-for-video/SKILL.md` |
| 简笔画动画视频 | 线稿风 + 短画面字;**主动网络搜参考图临摹**逼真非抽象GSAP 主时间线 + 可选 Anime.js抽检闭环 | `skills/sketch-animation-video/SKILL.md` |
@ -50,6 +51,25 @@
2. 按文档准备素材、音频与脚本
3. 在项目中执行渲染与抽检流程
### IT 基础设施 PNG -> 视频闭环
当一个任务同时需要 `it-infra-continuous-png` 和 `it-infra-evolution-video` 时,优先使用 v2 链路:
1. `it-infra-continuous-png` 先输出 `assets/images/*.png` 和 `assets/images/manifest.md`
2. `it-infra-evolution-video-v2` 读取 manifest,并调用 `scripts/build_it_infra_video.py`
3. 任务目录中必须留下 `video.config.json`、`index.html`、`renders/*.mp4`、`ffprobe.json`
示例:
```bash
python3 scripts/build_it_infra_video.py \
--project-dir /path/to/task/service-mesh-video \
--title "云原生 Service Mesh 网络科普视频" \
--audio-mode edge-tts \
--run-acceptance \
--output-name service-mesh-video.mp4
```
## 账号信息
- 名称:拓扑同学

664
scripts/build_it_infra_video.py Executable file
View File

@ -0,0 +1,664 @@
#!/usr/bin/env python3
"""Build an IT infrastructure explainer video project from a PNG manifest.
The runner is intentionally deterministic: it turns a manifest produced by
it-infra-continuous-png into one HyperFrames project, validates clip timing, and
optionally runs the HyperFrames/ffprobe acceptance chain.
"""
from __future__ import annotations
import argparse
import html
import json
import os
import re
import shutil
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable
# Columns the manifest table header must contain; each name is also a
# ManifestRow field, so a parsed row can be expanded straight into the dataclass.
REQUIRED_MANIFEST_COLUMNS = [
    "chapter_id",
    "title",
    "file",
    "source_type",
    "video_usage",
    "scan_mode",
    "safe_focus",
]
# Eight-byte signature compared against the start of every manifest image file.
PNG_MAGIC = b"\x89PNG\r\n\x1a\n"
# HyperFrames CLI version pinned into every `npx hyperframes@<version>` command.
HYPERFRAMES_VERSION = "0.6.15"
class BuildError(RuntimeError):
    """Signal a validation or build failure that ``main`` reports as exit code 1."""
@dataclass(frozen=True)
class ManifestRow:
    """One validated data row of the PNG manifest table.

    Every field holds the raw, whitespace-trimmed cell text of the manifest
    column with the same name.
    """

    # Chapter identifier; slugified later to form element ids and audio file names.
    chapter_id: str
    # Chapter heading shown in the scene and used as the image alt text.
    title: str
    # Image path, resolved against the project directory when validated.
    file: str
    # Free-form origin label; underscores are replaced by spaces when shown as a tag.
    source_type: str
    # Preferred scene subtitle; safe_focus is used when this is blank.
    video_usage: str
    # Image fit mode; read_manifest only accepts "cover" or "contain".
    scan_mode: str
    # Fallback subtitle text, also copied into the generated config.
    safe_focus: str
@dataclass(frozen=True)
class Section:
    """One timed chapter of the video, derived from a single manifest row."""

    # Slugified chapter id used in element ids and the voiceover file name.
    id: str
    # Offset of the section from the beginning of the video, in seconds.
    start: float
    # Length of the section, in seconds.
    duration: float
    # ``start`` rendered as M:SS for on-screen display.
    time_label: str
    # Short label for the bottom timeline (first eight characters of the title).
    timeline_label: str
    title: str
    subtitle: str
    # Small badges rendered under the subtitle.
    tags: list[str]
    # Image path copied from the manifest ``file`` column.
    image: str
    # Copied from the manifest ``scan_mode`` column ("cover" or "contain").
    image_fit: str
    # Project-relative path of the section's voiceover MP3.
    voiceover: str
    # Caption text; also the text sent to edge-tts for the voiceover.
    caption: str
    source_type: str
    safe_focus: str
def fail(message: str) -> None:
    """Abort the build by raising ``BuildError`` with ``message``.

    Never returns normally; callers use it as a one-line validation exit.
    """
    error = BuildError(message)
    raise error
def run(cmd: list[str], cwd: Path, *, capture: bool = False) -> subprocess.CompletedProcess[str]:
    """Echo ``cmd`` and execute it in ``cwd``, raising on a non-zero exit.

    Args:
        cmd: Program and arguments, executed without a shell.
        cwd: Working directory for the child process.
        capture: When true, stdout is captured as text and stderr is merged
            into it; otherwise the child inherits the parent's streams.

    Returns:
        The completed process (``stdout`` is populated only when capturing).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero code.
    """
    echoed = " ".join(cmd)
    print("+ " + echoed, flush=True)
    if capture:
        stdout_target, stderr_target = subprocess.PIPE, subprocess.STDOUT
    else:
        stdout_target, stderr_target = None, None
    return subprocess.run(
        cmd,
        cwd=str(cwd),
        check=True,
        text=True,
        stdout=stdout_target,
        stderr=stderr_target,
    )
def slugify(value: str, fallback: str) -> str:
    """Convert ``value`` to a lowercase id made of ``a-z``, ``0-9``, ``_`` and ``-``.

    Every run of other characters becomes a single hyphen, repeated hyphens
    are collapsed, and hyphens at either end are trimmed. ``fallback`` is
    returned when nothing usable remains (for example an all-CJK value).
    """
    lowered = value.strip().lower()
    hyphenated = re.sub(r"[^a-z0-9_-]+", "-", lowered)
    collapsed = re.sub(r"-{2,}", "-", hyphenated)
    slug = collapsed.strip("-")
    if slug:
        return slug
    return fallback
def parse_markdown_table(path: Path) -> list[dict[str, str]]:
    """Parse the manifest's markdown table into a list of header->cell dicts.

    Only lines that both start and end with ``|`` are treated as table lines.
    The first is the header, the second must be the ``| --- |`` delimiter row,
    and the rest are data rows. Rows whose required columns are all blank are
    skipped.

    Args:
        path: Location of the manifest markdown file.

    Returns:
        One dict per non-blank data row, keyed by header cell text.

    Raises:
        BuildError: If the file is missing, the table is too short, a required
            column is absent, the delimiter row is missing, a row's cell count
            differs from the header's, or no data row remains.
    """
    if not path.exists():
        fail(f"Manifest not found: {path}")

    def split_cells(line: str) -> list[str]:
        # Drop exactly one pipe per side; strip("|") would also swallow an
        # empty edge cell written as "||" and miscount the row.
        inner = line.removeprefix("|").removesuffix("|")
        return [cell.strip() for cell in inner.split("|")]

    lines = [line.strip() for line in path.read_text(encoding="utf-8").splitlines()]
    table_lines = [line for line in lines if line.startswith("|") and line.endswith("|")]
    if len(table_lines) < 3:
        fail(f"Manifest must contain a markdown table with data rows: {path}")
    headers = split_cells(table_lines[0])
    missing = [column for column in REQUIRED_MANIFEST_COLUMNS if column not in headers]
    if missing:
        fail(f"Manifest missing required columns: {', '.join(missing)}")
    # The second table line is skipped as the delimiter row. Verify that it
    # really is one, otherwise the first image row would be dropped silently.
    if not all(re.fullmatch(r":?-+:?", cell) for cell in split_cells(table_lines[1])):
        fail(f"Manifest table is missing the '| --- |' delimiter row after the header: {path}")
    rows: list[dict[str, str]] = []
    for line in table_lines[2:]:
        cells = split_cells(line)
        if len(cells) != len(headers):
            fail(f"Manifest row has {len(cells)} cells but header has {len(headers)}: {line}")
        row = dict(zip(headers, cells, strict=True))
        if any(row[column] for column in REQUIRED_MANIFEST_COLUMNS):
            rows.append(row)
    if not rows:
        fail("Manifest has no image rows")
    return rows
def read_manifest(path: Path, project_dir: Path) -> list[ManifestRow]:
    """Load the manifest and validate every row against the files on disk.

    Args:
        path: Manifest markdown file.
        project_dir: Directory that each row's ``file`` value is resolved against.

    Returns:
        The validated rows in manifest order.

    Raises:
        BuildError: If a row has an empty ``chapter_id``, an unsupported
            ``scan_mode``, or a ``file`` that is not a regular file starting
            with the PNG signature (plus anything ``parse_markdown_table`` raises).
    """
    rows: list[ManifestRow] = []
    for index, raw in enumerate(parse_markdown_table(path), start=1):
        row = ManifestRow(**{column: raw[column] for column in REQUIRED_MANIFEST_COLUMNS})
        if not row.chapter_id:
            fail(f"Manifest row {index} has an empty chapter_id")
        if row.scan_mode not in {"cover", "contain"}:
            fail(f"Manifest row {index} scan_mode must be cover or contain: {row.scan_mode}")
        image_path = project_dir / row.file
        # is_file() also rejects directories (an empty `file` cell resolves to
        # project_dir itself), which would otherwise crash the read below.
        if not image_path.is_file():
            fail(f"Manifest row {index} image file not found: {row.file}")
        # Only the signature is needed, so avoid loading the whole image.
        with image_path.open("rb") as handle:
            signature = handle.read(len(PNG_MAGIC))
        if signature != PNG_MAGIC:
            fail(f"Manifest row {index} image is not a real PNG: {row.file}")
        rows.append(row)
    return rows
def format_time(seconds: float) -> str:
    """Format ``seconds`` as ``M:SS``, rounding to whole seconds and clamping at zero."""
    whole_seconds = int(round(seconds))
    if whole_seconds < 0:
        whole_seconds = 0
    minutes, remainder = divmod(whole_seconds, 60)
    return f"{minutes}:{remainder:02d}"
def build_sections(rows: list[ManifestRow], section_duration: float) -> list[Section]:
    """Lay the manifest rows out back-to-back as fixed-length video sections.

    Args:
        rows: Validated manifest rows, in playback order.
        section_duration: Length of every section in seconds.

    Returns:
        One ``Section`` per row; section ``i`` starts at ``i * section_duration``.

    Raises:
        BuildError: If the resulting clips overlap or have a non-positive duration.
    """
    sections: list[Section] = []
    used_ids: set[str] = set()
    for index, row in enumerate(rows):
        start = round(index * section_duration, 3)
        base_id = slugify(row.chapter_id, f"chapter-{index + 1}")
        # Distinct chapter_ids can slugify to the same value ("foo bar" vs
        # "foo-bar", or a fallback colliding with a literal "chapter-2").
        # Section ids become DOM ids, so disambiguate with a numeric suffix.
        chapter_id = base_id
        suffix = 2
        while chapter_id in used_ids:
            chapter_id = f"{base_id}-{suffix}"
            suffix += 1
        used_ids.add(chapter_id)
        title = row.title.strip()
        # Prefer the intended usage text; fall back to the focus hint.
        subtitle = row.video_usage.strip() or row.safe_focus.strip()
        caption = f"{title}: {subtitle}" if subtitle else title
        tags = [
            row.source_type.replace("_", " "),
            "long image",
            row.scan_mode,
        ]
        sections.append(
            Section(
                id=chapter_id,
                start=start,
                duration=section_duration,
                time_label=format_time(start),
                timeline_label=title[:8] or f"Chapter {index + 1}",
                title=title,
                subtitle=subtitle,
                tags=tags,
                image=row.file,
                image_fit=row.scan_mode,
                voiceover=f"assets/audio/vo-{index + 1:02d}-{chapter_id}.mp3",
                caption=caption,
                source_type=row.source_type,
                safe_focus=row.safe_focus,
            )
        )
    validate_non_overlapping("section", ((s.start, s.duration, s.id) for s in sections))
    return sections
def validate_non_overlapping(name: str, clips: Iterable[tuple[float, float, str]]) -> None:
    """Check that clips on one track have positive length and never overlap.

    Args:
        name: Track label used in error messages.
        clips: ``(start, duration, clip_id)`` tuples in any order.

    Raises:
        BuildError: If a duration is not positive, or a clip starts more than
            one millisecond before the previous clip (in sorted order) ends.
    """
    ordered = list(clips)
    ordered.sort()
    last_end = -1.0
    last_id = ""
    for clip in ordered:
        start, duration, clip_id = clip
        if duration <= 0:
            fail(f"{name} clip has non-positive duration: {clip_id}")
        # One millisecond of slack absorbs rounding of the start offsets.
        if start < last_end - 0.001:
            fail(f"{name} clips overlap: {last_id} and {clip_id}")
        last_end = start + duration
        last_id = clip_id
def write_json_config(project_dir: Path, title: str, sections: list[Section]) -> dict:
    """Write ``video.config.json`` into ``project_dir`` and return its content.

    The config records the total duration (the latest section end), the
    canvas size, the style preset, one camelCase entry per section, and the
    midpoint of every section as ``inspectTimes``.
    """
    section_entries = []
    inspect_times = []
    for section in sections:
        section_entries.append(
            {
                "id": section.id,
                "start": section.start,
                "duration": section.duration,
                "timeLabel": section.time_label,
                "timelineLabel": section.timeline_label,
                "title": section.title,
                "subtitle": section.subtitle,
                "tags": section.tags,
                "image": section.image,
                "imageFit": section.image_fit,
                "voiceover": section.voiceover,
                "caption": section.caption,
                "sourceType": section.source_type,
                "safeFocus": section.safe_focus,
            }
        )
        inspect_times.append(round(section.start + section.duration / 2, 3))
    total_duration = round(max(s.start + s.duration for s in sections), 3)
    config = {
        "duration": total_duration,
        "timelineColumns": len(sections),
        "canvas": {"width": 1920, "height": 1080},
        "stylePreset": "it-infra-v2-blue-white-two-column-scan",
        "title": title,
        "sections": section_entries,
        "inspectTimes": inspect_times,
    }
    serialized = json.dumps(config, ensure_ascii=False, indent=2) + "\n"
    target = project_dir / "video.config.json"
    target.write_text(serialized, encoding="utf-8")
    return config
def ensure_project_scaffold(project_dir: Path) -> None:
    """Create the project folders and default metadata files if they are absent.

    Existing ``package.json`` and ``hyperframes.json`` files are left untouched.
    """
    for folder in ("assets/audio", "assets/images", "renders", "snapshots"):
        (project_dir / folder).mkdir(parents=True, exist_ok=True)

    package_json = project_dir / "package.json"
    if not package_json.exists():
        cli = f"npx --yes hyperframes@{HYPERFRAMES_VERSION}"
        package_manifest = {
            "name": "it-infra-evolution-video-v2-project",
            "private": True,
            "type": "module",
            "scripts": {
                command: f"{cli} {command}"
                for command in ("lint", "inspect", "snapshot", "render")
            },
        }
        package_json.write_text(json.dumps(package_manifest, indent=2) + "\n", encoding="utf-8")

    hyperframes_json = project_dir / "hyperframes.json"
    if not hyperframes_json.exists():
        hyperframes_manifest = {
            "$schema": "https://hyperframes.heygen.com/schema/hyperframes.json",
            "registry": "https://raw.githubusercontent.com/heygen-com/hyperframes/main/registry",
            "paths": {
                "blocks": "compositions",
                "components": "compositions/components",
                "assets": "assets",
            },
        }
        hyperframes_json.write_text(json.dumps(hyperframes_manifest, indent=2) + "\n", encoding="utf-8")
def generate_tone_audio(project_dir: Path, sections: list[Section]) -> None:
    """Synthesize placeholder sine-tone voiceovers and background music with ffmpeg.

    Each section gets a tone at ``360 + 60 * n`` Hz (``n`` is its 1-based
    position) lasting ``duration - 0.4`` seconds, never shorter than 0.4 s.
    The BGM is always regenerated to cover the whole video.

    Raises:
        BuildError: If ffmpeg is not on PATH.
        subprocess.CalledProcessError: If an ffmpeg invocation fails.
    """
    if not shutil.which("ffmpeg"):
        fail("ffmpeg is required for --audio-mode tone")
    for index, section in enumerate(sections, start=1):
        out = project_dir / section.voiceover
        out.parent.mkdir(parents=True, exist_ok=True)
        frequency = str(360 + index * 60)
        duration = str(max(0.4, section.duration - 0.4))
        run(
            [
                "ffmpeg",
                "-y",
                "-f",
                "lavfi",
                "-i",
                f"sine=frequency={frequency}:duration={duration}",
                "-q:a",
                "9",
                str(out),
            ],
            project_dir,
            capture=True,
        )
    # Reuse the shared BGM helper instead of duplicating its ffmpeg command;
    # it writes the same assets/audio/bgm.wav relative to project_dir.
    generate_tone_bgm(project_dir, max(s.start + s.duration for s in sections))
def generate_edge_tts_audio(project_dir: Path, sections: list[Section]) -> None:
    """Generate one voiceover MP3 per section by speaking its caption with edge-tts.

    An existing ``assets/audio/bgm.wav`` is kept; when it is absent a quiet
    sine-tone track covering the whole video is synthesized as a fallback.

    Raises:
        BuildError: If the ``edge-tts`` executable is not on PATH.
        subprocess.CalledProcessError: If an edge-tts invocation fails.
    """
    if not shutil.which("edge-tts"):
        fail("edge-tts is required for production voiceover generation")
    for section in sections:
        target = project_dir / section.voiceover
        target.parent.mkdir(parents=True, exist_ok=True)
        command = [
            "edge-tts",
            "--voice",
            "zh-CN-YunxiNeural",
            "--rate",
            "+20%",
            "--text",
            section.caption,
            "--write-media",
            str(target),
        ]
        run(command, project_dir)
    if (project_dir / "assets/audio/bgm.wav").exists():
        return
    video_end = max(s.start + s.duration for s in sections)
    generate_tone_bgm(project_dir, video_end)
def generate_tone_bgm(project_dir: Path, duration: float) -> None:
    """Write a quiet 120 Hz sine tone of ``duration`` seconds to ``assets/audio/bgm.wav``.

    The output path is relative; ffmpeg runs with ``project_dir`` as its
    working directory.

    Raises:
        BuildError: If ffmpeg is not on PATH.
        subprocess.CalledProcessError: If ffmpeg fails.
    """
    if not shutil.which("ffmpeg"):
        fail("ffmpeg is required to synthesize fallback BGM")
    tone_source = f"sine=frequency=120:duration={duration}"
    command = [
        "ffmpeg",
        "-y",
        "-f",
        "lavfi",
        "-i",
        tone_source,
        "-filter:a",
        "volume=0.08",
        "assets/audio/bgm.wav",
    ]
    run(command, project_dir, capture=True)
def css() -> str:
    """Return the stylesheet that ``write_html`` inlines into ``index.html``.

    The rules target a fixed 1920x1080 canvas. ``--timeline-columns`` defaults
    to 1 here and is overridden inline on ``#root`` by the generated page.
    """
    stylesheet = """
:root { --timeline-columns: 1; }
* { margin: 0; padding: 0; box-sizing: border-box; }
html, body {
width: 1920px;
height: 1080px;
overflow: hidden;
background: #f3faff;
font-family: Inter, "Noto Sans JP", Arial, sans-serif;
color: #07194f;
}
#root {
position: relative;
width: 1920px;
height: 1080px;
overflow: hidden;
background:
radial-gradient(circle at 82% 16%, rgba(73,217,255,0.26), transparent 30%),
radial-gradient(circle at 10% 92%, rgba(21,91,255,0.15), transparent 28%),
linear-gradient(135deg, #ffffff 0%, #f3faff 46%, #dceeff 100%);
}
.clip { position: absolute; overflow: hidden; }
.scene { inset: 0; opacity: 0; }
.topbar {
position: absolute; z-index: 40; top: 42px; left: 72px; right: 72px;
display: flex; justify-content: space-between; align-items: center;
font-size: 26px; font-weight: 850; color: rgba(7,25,79,0.82);
}
.brand-pill {
display: inline-flex; gap: 14px; align-items: center; padding: 14px 24px;
border-radius: 999px; color: #fff; background: linear-gradient(135deg, #155bff, #18bfa6);
box-shadow: 0 16px 40px rgba(21,91,255,0.22);
}
.brand-dot { width: 14px; height: 14px; border-radius: 50%; background: #fff; box-shadow: 0 0 24px #49d9ff; }
.scene-content {
width: 100%; height: 100%; padding: 92px 104px 214px;
display: grid; grid-template-columns: 0.96fr 1.04fr; gap: 58px; align-items: center;
}
.image-panel {
position: relative; width: 100%; height: 760px; border-radius: 34px; overflow: hidden;
background: #fff; border: 1px solid rgba(21,91,255,0.16);
box-shadow: 0 34px 90px rgba(17,60,128,0.18);
}
.image-panel img {
position: absolute; inset: 0; width: 100%; height: 100%; object-fit: cover;
object-position: center top; filter: saturate(1.06) contrast(1.03);
}
.image-panel.contain img { object-fit: contain; padding: 20px; background: #fff; }
.copy { position: relative; z-index: 2; display: flex; flex-direction: column; gap: 26px; }
.kicker {
width: max-content; max-width: 100%; color: #155bff; background: rgba(21,91,255,0.08);
border: 1px solid rgba(21,91,255,0.22); border-radius: 999px; padding: 12px 22px;
font-size: 27px; font-weight: 950;
}
h1, h2 { font-size: 64px; line-height: 1.08; font-weight: 950; letter-spacing: 0; }
.lead { font-size: 32px; line-height: 1.55; font-weight: 760; color: rgba(7,25,79,0.78); }
.tag-row { display: flex; gap: 14px; flex-wrap: wrap; }
.tag {
padding: 12px 16px; border-radius: 16px; color: #07194f; background: rgba(255,255,255,0.76);
border: 1px solid rgba(73,217,255,0.34); box-shadow: 0 14px 30px rgba(17,60,128,0.08);
font-size: 24px; font-weight: 900;
}
.caption {
position: absolute; z-index: 55; left: 320px; right: 320px; bottom: 132px; min-height: 78px;
display: grid; place-items: center; padding: 14px 32px; border-radius: 26px;
background: rgba(7,25,79,0.88); color: #fff; font-size: 32px; line-height: 1.34;
font-weight: 850; text-align: center; box-shadow: 0 20px 55px rgba(7,25,79,0.26); opacity: 0;
}
.timeline {
position: absolute; z-index: 52; left: 70px; right: 70px; bottom: 28px; height: 88px;
padding: 12px 14px 18px; display: grid; grid-template-columns: repeat(var(--timeline-columns), minmax(0, 1fr));
gap: 10px; align-items: center; overflow: visible; border-radius: 30px;
background: rgba(255,255,255,0.76); border: 1px solid rgba(21,91,255,0.15);
box-shadow: 0 20px 60px rgba(17,60,128,0.14); backdrop-filter: blur(12px);
}
.timeline-fill { position: absolute; left: 22px; right: 22px; bottom: 8px; height: 8px; border-radius: 999px; background: rgba(7,25,79,0.12); overflow: hidden; }
.timeline-progress { display: block; width: 0%; height: 100%; border-radius: inherit; background: linear-gradient(90deg, #155bff, #49d9ff, #18bfa6); }
.chapter-tag {
position: relative; z-index: 2; min-width: 0; height: 50px; display: flex; align-items: center; justify-content: center;
gap: 8px; padding: 0 10px; border-radius: 18px; background: rgba(255,255,255,0.72);
border: 1px solid rgba(21,91,255,0.18); color: rgba(7,25,79,0.74); font-size: 20px;
line-height: 1; font-weight: 900; white-space: nowrap; box-shadow: 0 12px 26px rgba(17,60,128,0.08);
}
.chapter-time { color: #155bff; font-variant-numeric: tabular-nums; }
.chapter-title { overflow: hidden; text-overflow: ellipsis; }
.chapter-tag.active { color: #fff; background: linear-gradient(135deg, #155bff, #18bfa6); border-color: rgba(255,255,255,0.62); box-shadow: 0 20px 42px rgba(21,91,255,0.28); }
.chapter-tag.active .chapter-time { color: #fff; }
.glow-line { position: absolute; width: 560px; height: 560px; border-radius: 50%; border: 2px solid rgba(73,217,255,0.34); right: -160px; top: -150px; }
"""
    return stylesheet
def js_array(values: list[str | float]) -> str:
    """Serialize ``values`` as a JSON array literal, keeping non-ASCII text unescaped."""
    literal = json.dumps(values, ensure_ascii=False)
    return literal
def write_html(project_dir: Path, title: str, config: dict) -> None:
    """Render the composition page and write it to ``project_dir / "index.html"``.

    Args:
        project_dir: Directory that receives ``index.html``.
        title: Text shown in the top-bar brand pill (HTML-escaped).
        config: Dict shaped like the return value of ``write_json_config``;
            ``sections``, ``duration`` and ``timelineColumns`` are read.

    The page contains one scene, caption, timeline chapter tag and voiceover
    ``<audio>`` element per section, a single ``bgm`` audio element, and an
    inline script that builds a paused GSAP timeline registered as
    ``window.__timelines["main"]``.
    """
    sections = config["sections"]
    duration = config["duration"]
    timeline_columns = config["timelineColumns"]
    scene_html = []
    caption_html = []
    chapter_html = []
    # The background music is the only audio element that spans the whole video.
    audio_html = [
        f'<audio id="bgm" class="clip" data-start="0" data-duration="{duration}" data-track-index="20" data-volume="0.10" src="assets/audio/bgm.wav"></audio>'
    ]
    for index, section in enumerate(sections):
        scene_id = f"scene-{section['id']}"
        cap_id = f"cap-{index + 1:02d}-{section['id']}"
        vo_id = f"vo-{index + 1:02d}-{section['id']}"
        image_panel_class = "image-panel contain" if section["imageFit"] == "contain" else "image-panel"
        tags = "".join(f'<span class="tag">{html.escape(tag)}</span>' for tag in section["tags"])
        # Only the first scene gets the page-level <h1>.
        heading_tag = "h1" if index == 0 else "h2"
        # Captions start 0.2 s into the section and last 0.4 s less than it
        # (never below 0.4 s); the script below recomputes the same window.
        caption_start = round(section["start"] + 0.2, 3)
        caption_duration = round(max(0.4, section["duration"] - 0.4), 3)
        scene_html.append(
            f"""
<section id="{scene_id}" class="clip scene" data-start="{section['start']}" data-duration="{section['duration']}" data-track-index="1">
<div class="scene-content">
<div class="{image_panel_class}"><img data-layout-allow-overflow src="{html.escape(section['image'])}" alt="{html.escape(section['title'])}" /></div>
<div class="copy">
<div class="kicker">{html.escape(section['timeLabel'])} / {html.escape(section['timelineLabel'])}</div>
<{heading_tag}>{html.escape(section['title'])}</{heading_tag}>
<p class="lead">{html.escape(section['subtitle'])}</p>
<div class="tag-row">{tags}</div>
</div>
</div>
</section>"""
        )
        caption_html.append(
            f'<div class="caption clip" id="{cap_id}" data-start="{caption_start}" data-duration="{caption_duration}" data-track-index="10">{html.escape(section["caption"])}</div>'
        )
        chapter_html.append(
            f'<div class="chapter-tag" id="chapter-{index}"><span class="chapter-time">{html.escape(section["timeLabel"])}</span><span class="chapter-title">{html.escape(section["timelineLabel"])}</span></div>'
        )
        # The voiceover starts with the section but reuses the caption's duration.
        audio_html.append(
            f'<audio id="{vo_id}" class="clip" data-start="{section["start"]}" data-duration="{caption_duration}" data-track-index="5" data-volume="0.92" src="{html.escape(section["voiceover"])}"></audio>'
        )
    # Parallel arrays consumed by the inline script, indexed by section position.
    starts = [section["start"] for section in sections]
    durations = [section["duration"] for section in sections]
    scenes = [f"#scene-{section['id']}" for section in sections]
    captions = [f"#cap-{index + 1:02d}-{section['id']}" for index, section in enumerate(sections)]
    chapters = [f"#chapter-{index}" for index in range(len(sections))]
    # Inside this f-string, doubled braces are literal JavaScript braces.
    index_html = f"""<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=1920, height=1080" />
<script src="https://cdn.jsdelivr.net/npm/gsap@3.14.2/dist/gsap.min.js"></script>
<style>{css()}</style>
</head>
<body>
<div id="root" data-composition-id="main" data-start="0" data-duration="{duration}" data-width="1920" data-height="1080" style="--timeline-columns: {timeline_columns}">
<div class="glow-line" data-layout-ignore></div>
<div class="topbar" data-layout-ignore>
<div class="brand-pill"><span class="brand-dot"></span><span>{html.escape(title)}</span></div>
<div>PNG manifest -> HyperFrames -> MP4</div>
</div>
{''.join(scene_html)}
{''.join(caption_html)}
<div class="timeline" data-layout-ignore>
{''.join(chapter_html)}
<div class="timeline-fill"><div class="timeline-progress"></div></div>
</div>
{''.join(audio_html)}
</div>
<script>
window.__timelines = window.__timelines || {{}};
const rootDuration = Number(document.querySelector("#root").dataset.duration || {duration});
const tl = gsap.timeline({{ paused: true }});
const scenes = {js_array(scenes)};
const starts = {js_array(starts)};
const durations = {js_array(durations)};
const captions = {js_array(captions)};
const chapters = {js_array(chapters)};
scenes.forEach((scene, index) => {{
const start = starts[index];
const duration = durations[index];
tl.set(scene, {{ opacity: 1 }}, start);
tl.to(scene, {{ opacity: 0, duration: 0.28, ease: "power1.in" }}, start + duration - 0.28);
tl.from(`${{scene}} .image-panel`, {{ x: -78, opacity: 0, scale: 0.96, duration: 0.72, ease: "power3.out" }}, start + 0.08);
tl.from(`${{scene}} .kicker`, {{ y: 28, opacity: 0, duration: 0.42, ease: "power2.out" }}, start + 0.18);
tl.from(`${{scene}} h1, ${{scene}} h2`, {{ y: 46, opacity: 0, duration: 0.62, ease: "power3.out" }}, start + 0.3);
tl.from(`${{scene}} .lead`, {{ y: 36, opacity: 0, duration: 0.54, ease: "power2.out" }}, start + 0.58);
tl.from(`${{scene}} .tag`, {{ y: 24, opacity: 0, scale: 0.94, duration: 0.42, stagger: 0.06, ease: "power2.out" }}, start + 0.82);
tl.to(`${{scene}} .image-panel img`, {{ y: -70, scale: 1.1, duration: Math.max(4, duration - 1), ease: "none" }}, start + 0.4);
}});
captions.forEach((caption, index) => {{
const start = starts[index] + 0.2;
const duration = Math.max(0.4, durations[index] - 0.4);
tl.to(caption, {{ opacity: 1, y: 0, duration: 0.16, ease: "power1.out" }}, start);
tl.to(caption, {{ opacity: 0, y: 14, duration: 0.16, ease: "power1.in" }}, start + duration - 0.16);
}});
chapters.forEach((chapter, index) => {{
const start = starts[index];
const duration = durations[index];
tl.to(chapter, {{ y: -8, scale: 1.04, duration: 0.18, ease: "power1.out" }}, start);
tl.set(chapter, {{ className: "chapter-tag active" }}, start);
tl.set(chapter, {{ className: "chapter-tag" }}, start + duration - 0.1);
tl.to(chapter, {{ y: 0, scale: 1, duration: 0.18, ease: "power1.in" }}, start + duration - 0.28);
}});
tl.to(".timeline-progress", {{ width: "100%", duration: rootDuration, ease: "none" }}, 0);
tl.from(".topbar", {{ y: -28, opacity: 0, duration: 0.5, ease: "power2.out" }}, 0.1);
tl.to(".glow-line", {{ scale: 1.18, rotation: 20, duration: rootDuration, ease: "none" }}, 0);
tl.to("#root", {{ opacity: 0, duration: 0.65, ease: "power2.in" }}, Math.max(0, rootDuration - 0.75));
window.__timelines["main"] = tl;
</script>
</body>
</html>
"""
    (project_dir / "index.html").write_text(index_html, encoding="utf-8")
def doctor(audio_mode: str, run_acceptance: bool) -> None:
    """Verify that the external tools needed for this particular run are on PATH.

    Args:
        audio_mode: ``"edge-tts"``, ``"tone"`` or ``"none"``.
        run_acceptance: Whether the lint/inspect/snapshot/render/ffprobe chain
            will run after the HTML is written.

    Raises:
        BuildError: Listing every required tool that could not be found.
    """
    required: list[str] = []
    # ffmpeg synthesizes tone audio and the fallback BGM used by edge-tts mode;
    # it also stays required for acceptance, as it was before this check was
    # made mode-aware.
    if audio_mode in {"edge-tts", "tone"} or run_acceptance:
        required.append("ffmpeg")
    # ffprobe and npx are only invoked by the acceptance chain, so a plain
    # config/HTML build must not fail because they are absent.
    if run_acceptance:
        required.extend(["ffprobe", "npx"])
    if audio_mode == "edge-tts":
        required.append("edge-tts")
    missing = [tool for tool in required if not shutil.which(tool)]
    if missing:
        fail(f"Missing required tool(s): {', '.join(missing)}")
def run_acceptance(project_dir: Path, config: dict, output_name: str) -> None:
    """Run HyperFrames lint, inspect, snapshot and render, then verify the MP4 with ffprobe.

    Args:
        project_dir: Project directory used as the working directory for every tool.
        config: Build config; ``inspectTimes`` and ``duration`` are read.
        output_name: File name of the MP4 written under ``renders/``.

    ``ffprobe.json`` is written only after every check has passed.

    Raises:
        BuildError: If the render lacks a video or audio stream, is not
            1920x1080, or its duration differs from the configured one by more
            than ``max(3 s, 15 %)``.
        subprocess.CalledProcessError: If any tool exits with a non-zero code.
    """
    inspect_at = ",".join(map(str, config["inspectTimes"]))
    output_path = f"renders/{output_name}"
    cli = ["npx", "--yes", f"hyperframes@{HYPERFRAMES_VERSION}"]
    stages = [
        ["lint"],
        ["inspect", "--at", inspect_at],
        ["snapshot", "--at", inspect_at],
        ["render", "--output", output_path, "--quality", "standard"],
    ]
    for stage in stages:
        run(cli + stage, project_dir)

    probe_command = [
        "ffprobe",
        "-v",
        "quiet",
        "-show_entries",
        "format=duration,size:stream=codec_type,width,height,r_frame_rate",
        "-of",
        "json",
        output_path,
    ]
    probe = run(probe_command, project_dir, capture=True)
    probe_json = json.loads(probe.stdout or "{}")
    streams = probe_json.get("streams", [])
    codec_types = [stream.get("codec_type") for stream in streams]
    video_stream = {}
    for stream in streams:
        if stream.get("codec_type") == "video":
            video_stream = stream
            break
    if "video" not in codec_types or "audio" not in codec_types:
        fail("ffprobe acceptance failed: rendered MP4 must contain video and audio streams")
    width = video_stream.get("width")
    height = video_stream.get("height")
    if width != 1920 or height != 1080:
        fail(f"ffprobe acceptance failed: expected 1920x1080, got {video_stream.get('width')}x{video_stream.get('height')}")
    actual_duration = float(probe_json.get("format", {}).get("duration", 0) or 0)
    expected_duration = float(config["duration"])
    tolerance = max(3.0, expected_duration * 0.15)
    drift = abs(actual_duration - expected_duration)
    if drift > tolerance:
        fail(
            "ffprobe acceptance failed: "
            f"expected duration near {expected_duration:.3f}s, got {actual_duration:.3f}s"
        )
    report = json.dumps(probe_json, indent=2) + "\n"
    (project_dir / "ffprobe.json").write_text(report, encoding="utf-8")
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: build the project and optionally run acceptance.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success, 1 when the build fails. Failures are reported on stderr
        rather than raised.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--project-dir", type=Path, default=Path.cwd(), help="HyperFrames project directory")
    parser.add_argument("--manifest", type=Path, default=None, help="PNG manifest path")
    parser.add_argument("--title", default="IT 基础设施长图讲解视频", help="Video title")
    parser.add_argument("--section-duration", type=float, default=8.0, help="Seconds per manifest row")
    parser.add_argument("--audio-mode", choices=["edge-tts", "tone", "none"], default="edge-tts")
    parser.add_argument("--run-acceptance", action="store_true", help="Run lint/inspect/snapshot/render/ffprobe")
    parser.add_argument("--output-name", default="it-infra-evolution.mp4", help="Rendered MP4 file name")
    args = parser.parse_args(argv)
    try:
        project_dir = args.project_dir.resolve()
        manifest = (args.manifest or project_dir / "assets/images/manifest.md").resolve()
        ensure_project_scaffold(project_dir)
        doctor(args.audio_mode, args.run_acceptance)
        rows = read_manifest(manifest, project_dir)
        sections = build_sections(rows, args.section_duration)
        config = write_json_config(project_dir, args.title, sections)
        if args.audio_mode == "edge-tts":
            generate_edge_tts_audio(project_dir, sections)
        elif args.audio_mode == "tone":
            generate_tone_audio(project_dir, sections)
        elif not (project_dir / "assets/audio/bgm.wav").exists():
            fail("--audio-mode none requires existing assets/audio/bgm.wav")
        write_html(project_dir, args.title, config)
        if args.run_acceptance:
            run_acceptance(project_dir, config, args.output_name)
        print("Build complete. Required task artifacts: index.html, video.config.json, assets/images/manifest.md, assets/audio/, renders/ or run with --run-acceptance.")
        return 0
    # OSError covers filesystem and process-launch failures (unwritable
    # project directory, a tool removed mid-run) so they produce the same
    # one-line report and exit code instead of a traceback.
    except (BuildError, subprocess.CalledProcessError, json.JSONDecodeError, OSError) as exc:
        print(f"Build failed: {exc}", file=sys.stderr)
        # Commands run with capture=True keep their output on the exception;
        # without echoing it the reason an ffmpeg/ffprobe call failed is lost.
        if isinstance(exc, subprocess.CalledProcessError) and exc.stdout:
            print(str(exc.stdout).rstrip(), file=sys.stderr)
        return 1
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -81,6 +81,23 @@ description: "生成 IT 基础设施系列连续风格 PNG 图片。适用于一
`it-infra-evolution-video` 不应重新发明这些图片的风格,只读取 manifest 并作为真实长图素材使用。
当任务还选择了 `it-infra-evolution-video-v2` 时,本 skill 完成后必须停在清晰的交接点:
1. 确认 `assets/images/*.png` 的数量与 manifest 数据行数量一致。
2. 确认每个 `file` 指向真实 PNG 文件,而不是 SVG、空文件或占位路径。
3. 将下一步命令写给视频 skill
```bash
python3 /path/to/ai-video-skills/scripts/build_it_infra_video.py \
--project-dir . \
--title "<用户主题>" \
--audio-mode edge-tts \
--run-acceptance \
--output-name "<topic-slug>.mp4"
```
不要在本 skill 中生成 `index.html`、`video.config.json` 或 MP4;这些是视频 skill 的职责。
## 参考文件
- 风格规范:`references/style-spec.md`

View File

@ -0,0 +1,73 @@
---
name: it-infra-evolution-video-v2
version: "v2"
description: "从 it-infra-continuous-png 的真实 PNG manifest 生成 IT 基础设施长图讲解视频。强制执行 manifest -> video.config.json -> index.html -> audio -> HyperFrames acceptance -> MP4 -> ffprobe 的闭环。"
---
# IT 基础设施长图讲解视频 v2
本 skill 是 `it-infra-evolution-video` 的可执行 v2 路径。v1 模板保持 frozen;v2 的主路径必须通过仓库 runner 完成,不再让 Agent 临时手写 `generate_index.py` 或自由拼接模板片段。
## 调用前置条件
必须先完成 `it-infra-continuous-png`
- `assets/images/*.png` 存在,且每个文件是真实 PNG。
- `assets/images/manifest.md` 存在。
- manifest 每一行都包含 `chapter_id`、`title`、`file`、`source_type`、`video_usage`、`scan_mode`、`safe_focus`。
缺少这些输入时,不要继续生成视频,不要用 CSS 卡片、假截图或 SVG 冒充 PNG。
## 标准调用
在当前任务工作目录或视频项目目录执行:
```bash
python3 /path/to/ai-video-skills/scripts/build_it_infra_video.py \
--project-dir . \
--title "云原生 Service Mesh 网络科普视频" \
--audio-mode edge-tts \
--run-acceptance \
--output-name service-mesh-video.mp4
```
OpenClaw 任务中如果同时选择了 `it-infra-continuous-png` 和 `it-infra-evolution-video-v2`,必须按以下顺序执行:
1. 先用 `it-infra-continuous-png` 生成多张 PNG 和 manifest。
2. 再用本 skill 的 runner 读取 manifest。
3. 最后把 `renders/service-mesh-video.mp4`、`video.config.json`、`assets/images/manifest.md`、`ffprobe.json` 留在当前 task workspace。
## Runner 合同
runner 负责:
- 解析并校验 manifest。
- 拒绝缺失图片、伪 PNG、缺失列、非法 `scan_mode`
- 生成唯一 ID 的 `index.html`
- 保证 scene、caption、voiceover 在各自 track 上不重叠。
- 只保留一个全局 BGM 音轨。
- 生成 `video.config.json` 和 `inspectTimes`
- 执行 `lint -> inspect -> snapshot -> render -> ffprobe`
生产模式默认 `--audio-mode edge-tts`。本地测试或无网络 dry-run 可以使用 `--audio-mode tone`,但不能把 tone 输出当作正式口播成片。
## 验收标准
只有以下文件都存在,才能在 XWorkmate/OpenClaw 中报告完成:
- `index.html`
- `video.config.json`
- `assets/images/manifest.md`
- `assets/audio/*.mp3`
- `assets/audio/bgm.wav`
- `renders/<output-name>.mp4`
- `ffprobe.json`
`ffprobe.json` 必须显示:
- 分辨率为 `1920x1080`
- 有 video stream
- 有 audio stream
- 时长接近 `video.config.json` 中的 `duration`
如果 HyperFrames 或 ffprobe 任一阶段失败,只输出失败阶段和原因,不输出“完成”。

View File

@ -0,0 +1,7 @@
# Fixture Image Manifest
| chapter_id | title | file | source_type | video_usage | scan_mode | safe_focus |
| --- | --- | --- | --- | --- | --- | --- |
| service-mesh-control-plane | 控制平面 | assets/images/001-control-plane.png | generated_from_description | 解释 Service Mesh 如何下发流量治理策略 | contain | center diagram and top title |
| service-mesh-data-plane | 数据平面 | assets/images/002-data-plane.png | generated_from_description | 解释 Sidecar 如何接管东西向流量 | cover | middle service nodes |
| service-mesh-observability | 可观测性 | assets/images/003-observability.png | generated_from_description | 解释指标、日志和链路追踪如何汇总 | contain | bottom telemetry cards |

View File

@ -0,0 +1,112 @@
import importlib.util
import json
import shutil
import struct
import sys
import tempfile
import unittest
import zlib
from pathlib import Path
# Directory one level above the folder that contains this test file.
ROOT = Path(__file__).resolve().parents[1]
SCRIPT = ROOT / "scripts/build_it_infra_video.py"
FIXTURE = ROOT / "tests/fixtures/it-infra-chain"
# Load the runner script by file path and expose it as the module object `runner`.
spec = importlib.util.spec_from_file_location("build_it_infra_video", SCRIPT)
runner = importlib.util.module_from_spec(spec)
assert spec.loader is not None
# Register the module in sys.modules before executing it.
sys.modules[spec.name] = runner
spec.loader.exec_module(runner)
def write_png(path: Path, rgb: tuple[int, int, int]) -> None:
    """Write a 16x16 single-colour 8-bit RGB PNG to ``path``, creating parent folders."""
    path.parent.mkdir(parents=True, exist_ok=True)
    side = 16
    # Every scanline is a filter-type byte (0) followed by `side` RGB triples.
    scanline = b"\x00" + bytes(rgb) * side
    pixels = scanline * side

    def chunk(kind: bytes, data: bytes) -> bytes:
        # PNG chunk layout: length, type, payload, CRC32 over type + payload.
        checksum = zlib.crc32(kind + data) & 0xFFFFFFFF
        return b"".join([struct.pack(">I", len(data)), kind, data, struct.pack(">I", checksum)])

    header = struct.pack(">IIBBBBB", side, side, 8, 2, 0, 0, 0)
    parts = [
        b"\x89PNG\r\n\x1a\n",
        chunk(b"IHDR", header),
        chunk(b"IDAT", zlib.compress(pixels)),
        chunk(b"IEND", b""),
    ]
    path.write_bytes(b"".join(parts))
def copy_fixture(tmp_path: Path) -> Path:
    """Copy the fixture into ``tmp_path / "project"`` and write its three PNG images."""
    project = tmp_path / "project"
    shutil.copytree(FIXTURE, project)
    images = (
        ("001-control-plane.png", (21, 91, 255)),
        ("002-data-plane.png", (24, 191, 166)),
        ("003-observability.png", (73, 217, 255)),
    )
    for file_name, colour in images:
        write_png(project / f"assets/images/{file_name}", colour)
    return project
class BuildItInfraVideoTest(unittest.TestCase):
    """Tests that drive the runner module against a copy of the fixture project."""

    def test_manifest_drives_config_and_html_without_duplicate_ids(self):
        """A failed ``none`` run writes no HTML; a ``tone`` run writes config and HTML."""
        with tempfile.TemporaryDirectory() as tmp:
            project = copy_fixture(Path(tmp))
            # First run: --audio-mode none is expected to fail (exit code 1)
            # and must leave no index.html behind.
            code = runner.main(
                [
                    "--project-dir",
                    str(project),
                    "--title",
                    "Service Mesh fixture",
                    "--audio-mode",
                    "none",
                ]
            )
            self.assertEqual(code, 1)
            self.assertFalse((project / "index.html").exists())
            # Second run on the same project: tone audio with 1.2 s sections.
            code = runner.main(
                [
                    "--project-dir",
                    str(project),
                    "--title",
                    "Service Mesh fixture",
                    "--audio-mode",
                    "tone",
                    "--section-duration",
                    "1.2",
                ]
            )
            self.assertEqual(code, 0)
            config = json.loads((project / "video.config.json").read_text(encoding="utf-8"))
            html = (project / "index.html").read_text(encoding="utf-8")
            self.assertEqual(config["timelineColumns"], 3)
            self.assertEqual(len(config["sections"]), 3)
            self.assertEqual(config["sections"][0]["image"], "assets/images/001-control-plane.png")
            self.assertEqual(config["sections"][1]["start"], 1.2)
            # One BGM element, one scene per chapter id, and three clips on
            # each of track 1 (scenes) and track 5 (voiceovers).
            self.assertEqual(html.count('id="bgm"'), 1)
            self.assertEqual(html.count('id="scene-service-mesh-control-plane"'), 1)
            self.assertEqual(html.count('data-track-index="1"'), 3)
            self.assertEqual(html.count('data-track-index="5"'), 3)

    def test_rejects_manifest_image_that_is_not_real_png(self):
        """A manifest image whose bytes are SVG text is rejected by ``read_manifest``."""
        with tempfile.TemporaryDirectory() as tmp:
            project = copy_fixture(Path(tmp))
            # Overwrite one image with SVG text while keeping its .png name.
            (project / "assets/images/002-data-plane.png").write_text("<svg></svg>", encoding="utf-8")
            # Table parsing alone does not inspect the files, so all three rows parse.
            rows = runner.parse_markdown_table(project / "assets/images/manifest.md")
            self.assertEqual(len(rows), 3)
            with self.assertRaisesRegex(runner.BuildError, "not a real PNG"):
                runner.read_manifest(project / "assets/images/manifest.md", project)
# Allow running this test file directly as a script.
if __name__ == "__main__":
    unittest.main()