# accounts/scripts/gen_docs_manifest.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Generate a documentation manifest for dl.svc.plus/docs.

The script scans the documentation tree (typically mounted at
``/data/update-server/docs``) and emits a single ``all.json`` file containing
metadata for every HTML/PDF pair that can be presented in the dashboard docs
portal.

It infers titles, categories, versions and presentation tags from the directory
structure and provides canonical URLs to the rendered assets. The resulting
JSON structure is a list of ``DocResource`` dictionaries that match the shape
consumed by the Next.js UI under ``dashboard/app/docs``.

Usage example::

    python3 scripts/gen_docs_manifest.py \
        --root /data/update-server/docs \
        --base-url-prefix https://dl.svc.plus/docs

The command is idempotent and safe to rerun. Hidden files/directories (prefixed
with ``.``) are ignored. Only ``.pdf`` and ``.html`` assets are considered for
listing.
"""

from __future__ import annotations

import argparse
import json
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple

# Lower-case file suffixes (including the leading dot) of assets that may be
# listed in the manifest.
SUPPORTED_EXTENSIONS = {".pdf", ".html"}


@dataclass
class DocEntry:
    """One documentation item together with its PDF and/or HTML renditions.

    ``to_payload`` converts the record into the dictionary that is stored in
    the generated manifest.
    """

    slug: str  # identifier derived from every path segment
    title: str  # display title
    category: Optional[str]  # human-readable category label
    version: Optional[str]  # human-readable version label
    version_dir: Optional[str]  # raw version directory name
    version_slug: Optional[str]  # slug form of the version
    collection_dir: Optional[str]  # raw collection directory name
    collection_slug: Optional[str]  # slug form of the collection
    collection_label: Optional[str]  # human-readable collection label
    language: Optional[str]  # language code, when one could be detected
    description: str  # explicit description; blank means "generate one"
    pdf_url: Optional[str]  # URL of the PDF rendition, if any
    html_url: Optional[str]  # URL of the HTML rendition, if any
    tags: Set[str]  # presentation tags; blank tags are dropped on export
    updated_ts: float  # POSIX timestamp of the newest asset; 0 means unknown
    path_segments: Sequence[str]  # directory segments leading to the asset

    def to_payload(self) -> Dict[str, object]:
        """Build the JSON-serialisable mapping for this entry.

        ``slug``, ``title`` and ``description`` are always present. Every
        other key is emitted only when its value is truthy. A blank
        description is replaced by a generated default.
        """
        formats = [
            label
            for label, url in (("PDF", self.pdf_url), ("HTML", self.html_url))
            if url
        ]
        description = self.description or build_default_description(
            self.title,
            self.category,
            self.version,
            self.version_dir,
            formats,
        )

        # The raw directory name is exposed as "variant" only when it differs
        # from the display label with its spaces turned back into dashes.
        variant = None
        if self.version_dir:
            label_as_dir = self.version.replace(" ", "-") if self.version else None
            if label_as_dir != self.version_dir:
                variant = self.version_dir

        # Order matters: it defines the key order of the emitted JSON object.
        optional_fields = (
            ("category", self.category),
            ("version", self.version),
            ("updatedAt", iso8601(self.updated_ts) if self.updated_ts else None),
            ("collection", self.collection_dir),
            ("collectionSlug", self.collection_slug),
            ("collectionLabel", self.collection_label),
            ("pdfUrl", self.pdf_url),
            ("htmlUrl", self.html_url),
            ("language", self.language),
            ("variant", variant),
            ("versionSlug", self.version_slug),
            ("tags", sorted({tag for tag in self.tags if tag})),
            ("pathSegments", list(self.path_segments) if self.path_segments else None),
        )

        payload: Dict[str, object] = {
            "slug": self.slug,
            "title": self.title,
            "description": description,
        }
        payload.update((key, value) for key, value in optional_fields if value)
        return payload


def iso8601(ts: float) -> str:
    """Format a POSIX timestamp as a UTC ISO-8601 string ending in ``Z``.

    The result has whole-second precision; fractional seconds are dropped.
    """
    moment = datetime.fromtimestamp(ts, tz=timezone.utc)
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")


def humanize_segment(segment: str) -> str:
    """Turn a path fragment into a label suitable for display.

    Underscores and dashes become spaces, a space is inserted where a lower
    case letter or digit is followed by an upper case letter, runs of
    whitespace are collapsed, and the result is trimmed. An empty segment
    yields an empty string.
    """
    if not segment:
        return ""

    label = segment.replace("_", " ").replace("-", " ")
    # First split camelCase boundaries, then squeeze the resulting whitespace.
    for pattern in (r"(?<=[a-z0-9])(?=[A-Z])", r"\s+"):
        label = re.sub(pattern, " ", label)
    return label.strip()


def slugify(parts: Iterable[str]) -> str:
    """Join *parts* into a lower-case, dash-separated slug.

    Within each part every run of characters other than ASCII letters and
    digits becomes a single dash. Parts that end up empty are dropped. When
    nothing is left the fallback slug ``"doc"`` is returned.
    """
    normalized = (
        re.sub(r"[^A-Za-z0-9]+", "-", piece).strip("-").lower()
        for piece in parts
    )
    slug = "-".join(token for token in normalized if token)
    return slug if slug else "doc"


def format_version_label(version_dir: Optional[str]) -> Optional[str]:
    """Derive a display label from a version directory name.

    The name is split on runs of dashes/underscores. When that yields at
    least two tokens the first two are joined with a space; otherwise the
    whole name is humanised. A missing or empty name yields ``None``.
    """
    if version_dir:
        pieces = re.split(r"[-_]+", version_dir)
        if len(pieces) < 2:
            return humanize_segment(version_dir)
        return f"{pieces[0]} {pieces[1]}"
    return None


def build_version_slug(version_dir: Optional[str], version_label: Optional[str]) -> Optional[str]:
    """Return a slug for the version route segment, or ``None``.

    The directory name is preferred; the label is used only when the
    directory name is missing or empty. ``None`` is returned when both are.
    """
    for candidate in (version_dir, version_label):
        if candidate:
            return slugify([candidate])
    return None


def detect_language(version_dir: Optional[str]) -> Optional[str]:
    """Extract a leading language code from a version directory name.

    A code is two or three upper-case ASCII letters at the very start of the
    name, followed either by a dash or by the end of the name (for example
    ``"EN-2024"`` or ``"ZH"``).

    Returns the code, or ``None`` when the name is missing, empty, or does
    not start with such a code.
    """
    if not version_dir:
        return None
    # The pattern itself limits the capture to 2-3 letters, so no further
    # length check is needed on the matched token.
    match = re.match(r"^([A-Z]{2,3})(?:-|$)", version_dir)
    return match.group(1) if match else None


def build_default_description(
    title: str,
    category: Optional[str],
    version: Optional[str],
    version_dir: Optional[str],
    formats: Sequence[str],
) -> str:
    """Compose a one-sentence fallback description for a document.

    The sentence starts with the title (or ``"Documentation"``), optionally
    followed by the category, and ends with a parenthesised list of details:
    the edition, the raw release directory when it differs from its humanised
    form, and the available formats. The parentheses are omitted when there
    are no details.
    """
    headline = title or "Documentation"
    if category:
        headline = f"{headline} — {category}"

    # Humanised directory name; only meaningful when a directory was given.
    readable_dir = humanize_segment(version_dir) if version_dir else None

    details: List[str] = []
    if version:
        details.append(f"edition {version}")
    elif version_dir:
        details.append(f"edition {readable_dir}")
    if version_dir and version_dir != readable_dir:
        details.append(f"release {version_dir}")
    if formats:
        # Joining a single format yields that format unchanged.
        details.append(f"available as {' and '.join(formats)}")

    if not details:
        return f"{headline}."
    return f"{headline} ({', '.join(details)})."


def should_skip(path: Path) -> bool:
    """Tell whether *path* contains a hidden component.

    A component is hidden when its name begins with a dot; this applies to
    directories and to the file name alike.
    """
    for component in path.parts:
        if component[:1] == ".":
            return True
    return False


def build_url(root: Path, file_path: Path, base_prefix: str) -> str:
    """Map an asset below *root* to its URL under *base_prefix*.

    Trailing slashes on the prefix are ignored and exactly one slash separates
    the prefix from the root-relative POSIX path. With an empty prefix (or one
    made only of slashes) the result is the root-relative path with a single
    leading slash. ``Path.relative_to`` raises ``ValueError`` when *file_path*
    is not located under *root*.
    """
    relative = file_path.relative_to(root).as_posix().lstrip("/")
    trimmed_prefix = base_prefix.rstrip("/")
    return f"{trimmed_prefix}/{relative}"


def create_entry(parts: Tuple[str, ...]) -> DocEntry:
    """Create a blank ``DocEntry`` from the path segments of a document.

    *parts* holds the path segments of a document, the last one being the
    document name. The first segment supplies the collection and category,
    the second segment (when present) the version directory, and the last
    segment the title. For short paths the same segment can therefore fill
    several roles. URLs, description and timestamp start out empty.
    """
    collection_dir = parts[0] if parts else None
    version_dir = parts[1] if len(parts) > 1 else None

    category = humanize_segment(parts[0]) if parts else None
    version_label = format_version_label(version_dir)
    language = detect_language(version_dir)
    display_title = humanize_segment(parts[-1]) if parts else ""

    return DocEntry(
        slug=slugify(parts),
        # Fall back to the raw name when humanising leaves nothing behind.
        title=display_title or parts[-1],
        category=category,
        version=version_label,
        version_dir=version_dir,
        version_slug=build_version_slug(version_dir, version_label),
        collection_dir=collection_dir,
        collection_slug=slugify(parts[:1]) if parts else None,
        collection_label=humanize_segment(collection_dir or "") if collection_dir else None,
        language=language,
        description="",
        pdf_url=None,
        html_url=None,
        # Seed the tags with whichever of the derived labels are available.
        tags={tag for tag in (category, language, version_label) if tag},
        updated_ts=0.0,
        path_segments=parts[:-1],
    )


def collect_docs(root: Path, base_prefix: str) -> List[DocEntry]:
    """Scan *root* recursively and build one entry per document.

    Files that share the same directory and the same name without extension
    are merged into a single entry, so a ``.pdf`` and an ``.html`` rendition
    end up as ``pdf_url`` and ``html_url`` of one record. Files with other
    suffixes and paths containing a dot-prefixed component are ignored.

    Args:
        root: Directory to scan.
        base_prefix: URL prefix passed on to ``build_url`` for every asset.

    Returns:
        The entries ordered by newest modification time first, ties broken by
        slug in ascending order.
    """
    entries: Dict[Tuple[str, ...], DocEntry] = {}

    for file_path in root.rglob("*"):
        if not file_path.is_file():
            continue
        # Normalise the suffix once; it drives both the filter and the choice
        # of URL slot below.
        suffix = file_path.suffix.lower()
        if suffix not in SUPPORTED_EXTENSIONS:
            continue
        rel = file_path.relative_to(root)
        if should_skip(rel):
            continue

        # Directory segments plus the extension-less file name. The stem is
        # always appended, so the key is never empty.
        key = rel.parts[:-1] + (file_path.stem,)
        entry = entries.get(key)
        if entry is None:
            entry = entries[key] = create_entry(key)

        url = build_url(root, file_path, base_prefix)
        if suffix == ".pdf":
            entry.pdf_url = url
        elif suffix == ".html":
            entry.html_url = url

        try:
            mtime = file_path.stat().st_mtime
        except FileNotFoundError:
            # The file vanished between listing and stat; keep the entry but
            # let it contribute no timestamp.
            mtime = 0.0
        entry.updated_ts = max(entry.updated_ts, mtime)

    return sorted(entries.values(), key=lambda e: (-e.updated_ts, e.slug))


def write_manifest(output_path: Path, entries: Sequence[DocEntry]) -> None:
    """Write *entries* to *output_path* as indented UTF-8 JSON.

    The payload (a list of ``entry.to_payload()`` results) is written to a
    sibling file whose name is *output_path* plus ``.tmp`` and then renamed
    over the destination, so the destination is only replaced once the new
    content has been written out in full.

    If serialising, writing or renaming fails, the temporary file is removed
    and the original exception is re-raised; an existing manifest is left as
    it was.
    """
    payload = [entry.to_payload() for entry in entries]
    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
    try:
        with tmp_path.open("w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)
            fh.write("\n")
        tmp_path.replace(output_path)
    except BaseException:
        # Do not leave a half-written temporary file behind. Cleanup is
        # best-effort and must never mask the error that brought us here.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the manifest generator.

    Args:
        argv: Argument list to parse. When ``None`` (the default) argparse
            falls back to ``sys.argv[1:]``, so existing callers are
            unaffected; passing a list allows programmatic use.

    Returns:
        A namespace with ``root``, ``base_url_prefix``, ``output`` and
        ``quiet``.
    """
    parser = argparse.ArgumentParser(description="Generate documentation manifest (all.json)")
    parser.add_argument("--root", required=True, help="Root directory of the docs tree (e.g. /data/update-server/docs)")
    parser.add_argument("--base-url-prefix", default="/docs", help="URL prefix to prepend to asset paths")
    parser.add_argument("--output", default="all.json", help="Output filename (default: all.json)")
    parser.add_argument("--quiet", action="store_true", help="Suppress progress output")
    return parser.parse_args(argv)


def main() -> None:
    """Run the generator: parse options, scan the tree, write the manifest.

    Exits via ``SystemExit`` with an explanatory message when the given root
    is not an existing directory. Progress lines are printed unless
    ``--quiet`` was passed. The manifest is written to ``--output`` joined
    onto the resolved root.
    """
    options = parse_args()
    verbose = not options.quiet

    docs_root = Path(options.root).resolve()
    if not (docs_root.exists() and docs_root.is_dir()):
        raise SystemExit(f"Root path does not exist or is not a directory: {docs_root}")

    manifest_entries = collect_docs(docs_root, options.base_url_prefix)
    if verbose:
        print(f"Discovered {len(manifest_entries)} documentation entries under {docs_root}")

    destination = docs_root / options.output
    write_manifest(destination, manifest_entries)
    if verbose:
        print(f"Wrote {destination}")


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()