dotfiles/pi/files/agent/extensions/pi-web-access/rsc-extract.ts

/**
 * RSC Content Extractor
 *
 * Extracts readable content from Next.js React Server Components (RSC) flight payloads.
 * RSC pages embed content as JSON in <script>self.__next_f.push([...])</script> tags.
 */

export interface RSCExtractResult {
  title: string;
  content: string;
}

export function extractRSCContent(html: string): RSCExtractResult | null {
  if (!html.includes("self.__next_f.push")) {
    return null;
  }

  // Parse all RSC chunks into a map
  const chunkMap = new Map<string, string>();
  const scriptRegex = /<script>self\.__next_f\.push\(\[1,"([\s\S]*?)"\]\)<\/script>/g;

  for (const match of html.matchAll(scriptRegex)) {
    let content: string;
    try {
      content = JSON.parse('"' + match[1] + '"');
    } catch {
      continue;
    }

    // Parse each line as "id:payload"
    // Lines are separated by \n, each line is one chunk
    // Chunk IDs are hex strings, typically 1-4 chars (supports up to 65535 chunks)
    for (const line of content.split("\n")) {
      if (!line.trim()) continue;

      const colonIdx = line.indexOf(":");
      if (colonIdx <= 0 || colonIdx > 4) continue;

      const id = line.slice(0, colonIdx);
      if (!/^[0-9a-f]+$/i.test(id)) continue;

      const payload = line.slice(colonIdx + 1);
      if (!payload) continue;

      const existing = chunkMap.get(id);
      if (!existing || payload.length > existing.length) {
        chunkMap.set(id, payload);
      }
    }
  }

  if (chunkMap.size === 0) return null;

  // Extract title
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/);
  const title = titleMatch?.[1]?.split("|")[0]?.trim() || "";

  // Parse and cache parsed chunks
  const parsedCache = new Map<string, unknown>();

  function getParsedChunk(id: string): unknown | null {
    if (parsedCache.has(id)) return parsedCache.get(id);

    const chunk = chunkMap.get(id);
    if (!chunk || !chunk.startsWith("[")) {
      parsedCache.set(id, null);
      return null;
    }

    try {
      const parsed = JSON.parse(chunk);
      parsedCache.set(id, parsed);
      return parsed;
    } catch {
      parsedCache.set(id, null);
      return null;
    }
  }

  // Extract markdown from nodes, resolving refs on the fly
  type Node = unknown;
  const visitedRefs = new Set<string>();

  function extractNode(node: Node, ctx = { inTable: false, inCode: false }): string {
    if (node === null || node === undefined) return "";

    if (typeof node === "string") {
      // Check if it's a reference like "$L30"
      const refMatch = node.match(/^\$L([0-9a-f]+)$/i);
      if (refMatch) {
        const refId = refMatch[1];
        if (visitedRefs.has(refId)) return ""; // Prevent cycles
        visitedRefs.add(refId);
        const refNode = getParsedChunk(refId);
        const result = refNode ? extractNode(refNode, ctx) : "";
        visitedRefs.delete(refId);
        return result;
      }
      // Filter out RSC-specific artifacts, but preserve content inside code blocks
      if (!ctx.inCode && (node === "$undefined" || node === "$" || /^\$[A-Z]/.test(node))) return "";
      return node.trim() ? node : "";
    }

    if (typeof node === "number") return String(node);
    if (typeof node === "boolean") return "";
    if (!Array.isArray(node)) return "";

    // RSC element: ["$", "tag", key, props]
    if (node[0] === "$" && typeof node[1] === "string") {
      const tag = node[1] as string;
      const props = (node[3] || {}) as Record<string, unknown>;

      // Skip non-content
      const skipTags = ["script", "style", "svg", "path", "circle", "link", "meta",
                        "template", "button", "input", "nav", "footer", "aside"];
      if (skipTags.includes(tag)) return "";

      // Component ref like $L25
      if (tag.startsWith("$L")) {
        const refId = tag.slice(2);
        if (visitedRefs.has(refId)) return "";

        // Check for heading components with baseId
        if (props.baseId && props.children) {
          return `## ${String(props.children)}\n\n`;
        }

        visitedRefs.add(refId);
        const refNode = getParsedChunk(refId);
        let result = "";
        if (refNode) {
          result = extractNode(refNode, ctx);
        } else if (props.children) {
          result = extractNode(props.children as Node, ctx);
        }
        visitedRefs.delete(refId);
        return result;
      }

      const children = props.children;
      const content = children ? extractNode(children as Node, ctx) : "";

      switch (tag) {
        case "h1": return `# ${content.trim()}\n\n`;
        case "h2": return `## ${content.trim()}\n\n`;
        case "h3": return `### ${content.trim()}\n\n`;
        case "h4": return `#### ${content.trim()}\n\n`;
        case "h5": return `##### ${content.trim()}\n\n`;
        case "h6": return `###### ${content.trim()}\n\n`;
        case "p": return ctx.inTable ? content : `${content.trim()}\n\n`;
        case "code": {
          const codeContent = children ? extractNode(children as Node, { ...ctx, inCode: true }) : "";
          return ctx.inCode ? codeContent : `\`${codeContent}\``;
        }
        case "pre": {
          const preContent = children ? extractNode(children as Node, { ...ctx, inCode: true }) : "";
          return "```\n" + preContent + "\n```\n\n";
        }
        case "strong": case "b": return `**${content}**`;
        case "em": case "i": return `*${content}*`;
        case "li": return `- ${content.trim()}\n`;
        case "ul": case "ol": return content + "\n";
        case "blockquote": return `> ${content.trim()}\n\n`;
        case "table": return extractTable(node as unknown[]) + "\n";
        case "thead": case "tbody": case "tr": case "th": case "td":
          return content;
        case "div":
          if (props.role === "alert" || props["data-slot"] === "alert") {
            return `> ${content.trim()}\n\n`;
          }
          return content;
        case "a": {
          const href = props.href as string | undefined;
          return href && !href.startsWith("#") ? `[${content}](${href})` : content;
        }
        default: return content;
      }
    }

    // Array of child nodes
    return (node as Node[]).map(n => extractNode(n, ctx)).join("");
  }

  function extractTable(tableNode: unknown[]): string {
    const props = (tableNode[3] || {}) as Record<string, unknown>;
    const rows: string[][] = [];
    let headerRowCount = 0;

    function walkTable(node: unknown, isHeader = false): void {
      if (node === null || node === undefined) return;

      // Handle string refs
      if (typeof node === "string") {
        const refMatch = node.match(/^\$L([0-9a-f]+)$/i);
        if (refMatch && !visitedRefs.has(refMatch[1])) {
          visitedRefs.add(refMatch[1]);
          const refNode = getParsedChunk(refMatch[1]);
          if (refNode) walkTable(refNode, isHeader);
          visitedRefs.delete(refMatch[1]);
        }
        return;
      }

      if (!Array.isArray(node)) return;

      if (node[0] === "$") {
        const tag = node[1] as string;
        const nodeProps = (node[3] || {}) as Record<string, unknown>;

        // Handle component refs
        if (tag.startsWith("$L")) {
          const refId = tag.slice(2);
          if (!visitedRefs.has(refId)) {
            visitedRefs.add(refId);
            const refNode = getParsedChunk(refId);
            if (refNode) walkTable(refNode, isHeader);
            visitedRefs.delete(refId);
          }
          return;
        }

        if (tag === "thead") walkTable(nodeProps.children, true);
        else if (tag === "tbody") walkTable(nodeProps.children, false);
        else if (tag === "tr") {
          const cells: string[] = [];
          walkCells(nodeProps.children, cells);
          if (cells.length > 0) {
            rows.push(cells);
            if (isHeader) headerRowCount++;
          }
        } else walkTable(nodeProps.children, isHeader);
      } else {
        for (const child of node) walkTable(child, isHeader);
      }
    }

    function walkCells(node: unknown, cells: string[]): void {
      if (node === null || node === undefined) return;

      // Handle string refs
      if (typeof node === "string") {
        const refMatch = node.match(/^\$L([0-9a-f]+)$/i);
        if (refMatch && !visitedRefs.has(refMatch[1])) {
          visitedRefs.add(refMatch[1]);
          const refNode = getParsedChunk(refMatch[1]);
          if (refNode) walkCells(refNode, cells);
          visitedRefs.delete(refMatch[1]);
        }
        return;
      }

      if (!Array.isArray(node)) return;

      if (node[0] === "$" && (node[1] === "td" || node[1] === "th")) {
        const cellProps = (node[3] || {}) as Record<string, unknown>;
        const text = extractNode(cellProps.children, { inTable: true, inCode: false })
          .trim()
          .replace(/\n/g, " ")
          .replace(/\\/g, "\\\\")  // Escape backslashes first
          .replace(/\|/g, "\\|");  // Then escape pipes
        cells.push(text);
      } else if (node[0] === "$" && typeof node[1] === "string" && (node[1] as string).startsWith("$L")) {
        // Component ref for a cell
        const refId = (node[1] as string).slice(2);
        if (!visitedRefs.has(refId)) {
          visitedRefs.add(refId);
          const refNode = getParsedChunk(refId);
          if (refNode) walkCells(refNode, cells);
          visitedRefs.delete(refId);
        }
      } else {
        for (const child of node) walkCells(child, cells);
      }
    }

    walkTable(props.children);
    if (rows.length === 0) return "";

    const colCount = Math.max(...rows.map(r => r.length));
    let md = "";
    for (let i = 0; i < rows.length; i++) {
      const row = rows[i].concat(Array(colCount - rows[i].length).fill(""));
      md += "| " + row.join(" | ") + " |\n";
      if (i === headerRowCount - 1 || (headerRowCount === 0 && i === 0)) {
        md += "| " + Array(colCount).fill("---").join(" | ") + " |\n";
      }
    }
    return md;
  }

  // Process main content chunk (usually "23")
  const mainChunk = getParsedChunk("23");

  if (mainChunk) {
    const content = extractNode(mainChunk);
    if (content.trim().length > 100) {
      const cleaned = content
        .replace(/\n{3,}/g, "\n\n")
        .trim();
      return { title, content: cleaned };
    }
  }

  // Fallback: try other chunks
  const contentParts: { order: number; text: string }[] = [];

  for (const [id] of chunkMap) {
    if (id === "23") continue;
    const parsed = getParsedChunk(id);
    if (!parsed) continue;

    visitedRefs.clear();
    const text = extractNode(parsed);

    if (text.trim().length > 50 &&
        !text.includes("page was not found") &&
        !text.includes("404")) {
      contentParts.push({ order: parseInt(id, 16), text: text.trim() });
    }
  }

  if (contentParts.length === 0) return null;

  contentParts.sort((a, b) => a.order - b.order);

  const seen = new Set<string>();
  const uniqueParts: string[] = [];
  for (const part of contentParts) {
    const key = part.text.slice(0, 150);
    if (!seen.has(key)) {
      seen.add(key);
      uniqueParts.push(part.text);
    }
  }

  const content = uniqueParts.join("\n\n").replace(/\n{3,}/g, "\n\n").trim();
  return content.length > 100 ? { title, content } : null;
}