Sherif Cherfa & Sarah Chieng
February 12, 2026
Verifying the validity of your docs is a key part of maintaining good documentation. However, manually auditing docs is tedious and error-prone. This notebook builds an automated docs checker that crawls your documentation site with Browserbase and analyzes each page with Cerebras for quality issues. To get started, you’ll need Cerebras, Browserbase, and GitHub API keys, which you’ll configure in Step 1.

Step 1: Environment Setup + API Keys

Install dependencies and configure your Cerebras, Browserbase, and GitHub credentials.
!pip install -q playwright browserbase stagehand langchain-community cerebras-cloud-sdk pandas pydantic GitPython
!python -m playwright install chromium
import os
import asyncio
from datetime import datetime
from collections import deque, Counter
from urllib.parse import urlparse

import httpx
import pandas as pd

from pydantic import BaseModel, Field
from playwright.async_api import async_playwright
from IPython.display import display, clear_output, Markdown
from typing import Optional, List
Set your Cerebras, Browserbase, and GitHub credentials. You can also set these as environment variables.
CEREBRAS_API_KEY = os.getenv("CEREBRAS_API_KEY", "YOUR_CEREBRAS_API_KEY")
BROWSERBASE_API_KEY = os.getenv("BROWSERBASE_API_KEY", "YOUR_BROWSERBASE_API_KEY")
BROWSERBASE_PROJECT_ID = os.getenv("BROWSERBASE_PROJECT_ID", "YOUR_BROWSERBASE_PROJECT_ID")
GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", "YOUR_GITHUB_TOKEN")

CEREBRAS_MODEL = "gpt-oss-120b"

print(f"✓ Configuration loaded")
print(f"  Model: {CEREBRAS_MODEL}")

Step 2: Define Data Models

First, create Pydantic models to capture crawl results and detected issues. Concretely, the checker will detect the following categories of issues:
  • Unresolved references: links, anchors, or assets that fail to resolve.
  • Invalid snippets: code or structured blocks that are syntactically invalid or contain obvious placeholders.
  • Source-of-truth mismatches: claims in the docs that do not match an authoritative source (for example, a repository).
  • Cross-page inconsistencies: factual contradictions across different documentation pages.
  • Language errors: clear spelling or grammatical errors.
  • Missing required elements: required context (such as authentication or installation steps) that is absent based on predefined rules.
class Issue(BaseModel):
    category: str
    description: str
    page_url: str
    location: Optional[str] = None
    observed: Optional[str] = None
    expected: Optional[str] = None
    evidence: List[str] = Field(default_factory=list)

class LinkResult(BaseModel):
    url: str
    ok: bool
    status: Optional[int] = None
    error: Optional[str] = None

class PageResult(BaseModel):
    url: str
    issues: List[Issue] = Field(default_factory=list)
    links: List[LinkResult] = Field(default_factory=list)

print("✓ Data models defined")

Step 3: Crawl Documentation with Browserbase

Use a breadth-first search to discover pages, extract content, and detect broken links during the crawl itself.
from browserbase import Browserbase

bb = Browserbase(api_key=BROWSERBASE_API_KEY)
session = bb.sessions.create(project_id=BROWSERBASE_PROJECT_ID)
BROWSERBASE_CDP_URL = session.connect_url


async def check_link(client: httpx.AsyncClient, url: str) -> LinkResult:
    """Return a LinkResult for whether this URL resolves (lightweight HTTP check)."""
    try:
        r = await client.head(url, follow_redirects=True, timeout=8)
        return LinkResult(url=url, ok=(r.status_code < 400), status=r.status_code, error=None)
    except Exception as e:
        return LinkResult(url=url, ok=False, status=None, error=str(e))

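# Note: some servers reject HEAD requests (403/405). This optional variant is a
# sketch (not wired into crawl() below) that retries with GET before flagging:
async def check_link_with_get_fallback(client: httpx.AsyncClient, url: str) -> LinkResult:
    result = await check_link(client, url)
    if not result.ok and result.status in (403, 405, 501):
        try:
            r = await client.get(url, follow_redirects=True, timeout=8)
            return LinkResult(url=url, ok=(r.status_code < 400), status=r.status_code, error=None)
        except Exception as e:
            return LinkResult(url=url, ok=False, status=None, error=str(e))
    return result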

async def crawl(root_url: str, max_pages: int = 30, max_depth: int = 2) -> list[PageResult]:
    """
      1) Crawl pages (BFS)
      2) Collect links
      3) Check which links resolve (lightweight HTTP check)
      4) Emit PageResult with links + unresolved_reference issues
    """
    base_domain = urlparse(root_url).netloc

    visited: dict[str, PageResult] = {}
    queue = deque([(root_url, 0)])

    async with async_playwright() as p:
        browser = await p.chromium.connect_over_cdp(BROWSERBASE_CDP_URL)
        page = await browser.new_page()

        async with httpx.AsyncClient() as http:
            while queue and len(visited) < max_pages:
                url, depth = queue.popleft()
                url = url.split("#")[0]

                if url in visited or depth > max_depth:
                    continue

                clear_output(wait=True)
                print(f"Crawling [{len(visited)+1}/{max_pages}] (depth={depth}): {url[:80]}")

                result = PageResult(url=url)

                try:
                    await page.goto(url, timeout=15000, wait_until="domcontentloaded")

                    # Collect absolute links from the rendered page
                    links = await page.eval_on_selector_all(
                        "a[href]",
                        "els => els.map(e => e.href)"
                    )

                    # Check a capped number of links to keep this step cheap
                    for href in links[:30]:
                        if not href.startswith("http"):
                            continue

                        lr = await check_link(http, href)
                        result.links.append(lr)

                        if not lr.ok:
                            result.issues.append(
                                Issue(
                                    category="unresolved_reference",
                                    description=f"Link did not return a successful HTTP status (status={lr.status})",
                                    page_url=url,
                                    location=None,
                                    evidence=[href],
                                )
                            )

                        # Queue internal links for crawling
                        parsed = urlparse(href)
                        if parsed.netloc == base_domain:
                            next_url = href.split("#")[0]
                            if next_url not in visited:
                                queue.append((next_url, depth + 1))

                except Exception as e:
                    result.issues.append(
                        Issue(
                            category="unresolved_reference",
                            description="Page failed to load",
                            page_url=url,
                            location=None,
                            evidence=[str(e)],
                        )
                    )

                visited[url] = result

        await browser.close()

    clear_output(wait=True)
    print(f"✓ Crawled {len(visited)} pages")
    return list(visited.values())
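
The crawl above follows any link on the same domain. If your docs live under a subpath (for example /docs/), a small guard like this sketch (same_section is a name introduced here, not part of the pipeline) keeps the queue from wandering into unrelated pages; substitute it for the netloc check inside crawl:
def same_section(href: str, root_url: str) -> bool:
    """True if href shares the root's domain AND sits under its path prefix."""
    root, link = urlparse(root_url), urlparse(href)
    return link.netloc == root.netloc and link.path.startswith(root.path)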

Step 4: AI Analysis with Cerebras

Next, use Stagehand (agentic browser automation) powered by Cerebras (fast inference) to analyze each page for deeper issues: invalid snippets, source-of-truth mismatches, cross-page inconsistencies, language errors, and missing required elements. The analysis uses structured outputs for reliable JSON parsing.
from cerebras.cloud.sdk import Cerebras

cerebras = Cerebras(api_key=CEREBRAS_API_KEY)
STAGEHAND_MODEL = f"cerebras/{CEREBRAS_MODEL}"

from IPython.display import clear_output
from stagehand import AsyncStagehand

ALLOWED_CATEGORIES = [
    "invalid_snippet",
    "source_of_truth_mismatch",
    "cross_page_inconsistency",
    "language_error",
    "missing_required_element",
]

ISSUES_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "category": {"type": "string", "enum": ALLOWED_CATEGORIES},
            "description": {"type": "string"},
            "location": {"type": ["string", "null"]},
            "observed": {"type": ["string", "null"]},
            "expected": {"type": ["string", "null"]},
            "evidence": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["category", "description", "location", "observed", "expected", "evidence"],
        "additionalProperties": False,
    },
}


async def analyze_one_with_stagehand(pr: PageResult) -> PageResult:
    """
    Step 4: Use Stagehand (Cerebras-backed) to add objective issues.
    IMPORTANT: Do not report unresolved_reference here (the crawl in Step 3 already did).
    """
    sh = AsyncStagehand(
        browserbase_api_key=BROWSERBASE_API_KEY,
        browserbase_project_id=BROWSERBASE_PROJECT_ID,
        model_api_key=CEREBRAS_API_KEY,
    )
    session = await sh.sessions.start(model_name=STAGEHAND_MODEL)

    try:
        await session.navigate(url=pr.url)

        instruction = f"""
You are reviewing a documentation page. Report ONLY objective issues.
URL: {pr.url}

Do NOT report broken links, broken anchors, or page load failures. Those are handled earlier.

Return issues only in these categories:
- invalid_snippet: invalid JSON/YAML, malformed code fences, obvious placeholders like TODO/<YOUR_KEY>
- source_of_truth_mismatch: contradicts an authoritative source (if provided)
- cross_page_inconsistency: contradicts other docs (if known)
- language_error: clear spelling/grammar errors (not style/tone)
- missing_required_element: missing required element based on explicit rules (e.g. API request shown but no auth mention)

For each issue include:
- location: best-effort section heading or a short locator string
- observed: the problematic text/snippet
- expected: what should be true instead (if applicable)
- evidence: short direct quotes/snippets
""".strip()

        resp = await session.extract(instruction=instruction, schema=ISSUES_SCHEMA)
        items = resp.data.result if resp and resp.data else []

        if isinstance(items, list):
            for item in items:
                if not isinstance(item, dict):
                    continue
                pr.issues.append(
                    Issue(
                        category=item["category"],
                        description=item["description"],
                        page_url=pr.url,
                        location=item.get("location"),
                        observed=item.get("observed"),
                        expected=item.get("expected"),
                        evidence=item.get("evidence", []),
                    )
                )
    finally:
        await session.end()

    return pr
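
# Example (single page; run after the crawl):
#   page_results[0] = await analyze_one_with_stagehand(page_results[0])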


async def analyze_all_optimized(page_results):
    """Single session for all pages = much faster"""
    print(f"Analyzing {len(page_results)} pages with single session...")

    sh = AsyncStagehand(
        browserbase_api_key=BROWSERBASE_API_KEY,
        browserbase_project_id=BROWSERBASE_PROJECT_ID,
        model_api_key=CEREBRAS_API_KEY)

    session = await sh.sessions.start(model_name=STAGEHAND_MODEL)
    print(f"Session: https://browserbase.com/sessions/{getattr(session, 'session_id', 'N/A')}")

    try:
        for i, pr in enumerate(page_results):
            print(f"[{i+1}/{len(page_results)}] {pr.url[:60]}...")
            try:
                await session.navigate(url=pr.url)
                resp = await session.extract(
                    instruction=f"Review {pr.url} for issues. Skip broken links.",
                    schema=ISSUES_SCHEMA)
                items = resp.data.result if resp and resp.data else []
                for item in (items if isinstance(items, list) else []):
                    if isinstance(item, dict):
                        pr.issues.append(Issue(
                            category=item["category"],
                            description=item["description"],
                            page_url=pr.url,
                            location=item.get("location"), observed=item.get("observed"), expected=item.get("expected"),
                            evidence=item.get("evidence", [])))
            except Exception as e:
                print(f"  Error: {e}")
    finally:
        await session.end()

    print(f"Done! {sum(len(p.issues) for p in page_results)} issues")
    return page_results

print("Ready! Use: page_results = await analyze_all_optimized(page_results)")

Step 5: Display Results

Finally, let’s write three functions to view and export findings.
  • summarize: Provides high-level statistics across all pages without listing individual issues
  • show_issues: Displays all issues in a structured table, with optional filtering by category
  • export_markdown: Combines both views into a readable Markdown report with overall summaries and issue-level details
def collect_issues(page_results: list[PageResult]) -> list[Issue]:
    """Flatten issues across all PageResults."""
    all_issues: list[Issue] = []
    for pr in page_results:
        all_issues.extend(pr.issues)
    return all_issues


def summarize(page_results: list[PageResult], root_url: str):
    """Display summary statistics (objective categories only)."""
    issues = collect_issues(page_results)

    category_counts = Counter(i.category for i in issues)
    unresolved = category_counts.get("unresolved_reference", 0)

    md = f"""### Summary for {root_url}

| Metric | Value |
|--------|-------|
| Pages crawled | {len(page_results)} |
| Total issues | {len(issues)} |
| Unresolved references | {unresolved} |

**By category:** {', '.join(f'{k}: {v}' for k, v in category_counts.most_common())}
"""
    display(Markdown(md))


def show_issues(page_results: list[PageResult], category_filter: Optional[str] = None, limit: int = 200):
    """
    Display issues as a table, optionally filtered by category.
    """
    issues = collect_issues(page_results)
    filtered = [i for i in issues if category_filter is None or i.category == category_filter]

    if not filtered:
        print(f"No {category_filter + ' ' if category_filter else ''}issues found.")
        return

    # Keep display compact + useful
    rows = []
    for i in filtered[:limit]:
        page_name = i.page_url.split("/")[-1] or "index"
        rows.append({
            "Category": i.category,
            "Page": page_name,
            "URL": i.page_url,
            "Location": i.location or "",
            "Description": (i.description[:90] + "…") if len(i.description) > 90 else i.description,
            "Observed": (i.observed[:70] + "…") if i.observed and len(i.observed) > 70 else (i.observed or ""),
            "Expected": (i.expected[:70] + "…") if i.expected and len(i.expected) > 70 else (i.expected or ""),
            "Evidence": (i.evidence[0][:70] + "…") if i.evidence else "",
        })

    df = pd.DataFrame(rows)
    display(df)


def export_markdown(page_results: list[PageResult], root_url: str) -> str:
    """Generate a Markdown report (objective + evidence-based)."""
    issues = collect_issues(page_results)
    category_counts = Counter(i.category for i in issues)

    report = f"""# Documentation Analysis Report

**Site:** {root_url}
**Date:** {datetime.now().strftime("%Y-%m-%d %H:%M")}
**Pages:** {len(page_results)} | **Issues:** {len(issues)}

## Summary

| Category | Count |
|----------|-------|
"""
    for cat, cnt in category_counts.most_common():
        report += f"| {cat} | {cnt} |\n"

    report += "\n## Issues\n\n"

    # Group by category for readability
    for cat, cnt in category_counts.most_common():
        report += f"### {cat} ({cnt})\n\n"
        cat_issues = [i for i in issues if i.category == cat]

        for i in cat_issues:
            page_name = i.page_url.split("/")[-1] or "index"
            report += f"- **[{page_name}]** {i.description}\n"
            if i.location:
                report += f"  - Location: `{i.location}`\n"
            if i.observed:
                report += f"  - Observed: `{i.observed}`\n"
            if i.expected:
                report += f"  - Expected: `{i.expected}`\n"
            if i.evidence:
                # keep evidence readable; don’t dump giant blobs
                ev = i.evidence[:2]
                report += f"  - Evidence: {', '.join(f'`{e}`' for e in ev)}\n"
        report += "\n"

    return report
#@title Report Display
from IPython.display import HTML, display

# Cerebras Brand Colors
CEREBRAS_DARK = "#1a1a2e"
CEREBRAS_ORANGE = "#f97316"
CEREBRAS_GRAY = "#2d2d44"
CEREBRAS_TEXT = "#e5e5e5"

def display_styled_report(page_results: list, root_url: str):
    """Generate a beautiful Cerebras-branded HTML report."""
    issues = collect_issues(page_results)
    category_counts = Counter(i.category for i in issues)

    # Build summary rows
    summary_rows = "".join([
        f'<tr><td style="padding:8px;border-bottom:1px solid {CEREBRAS_GRAY};">{cat}</td>'
        f'<td style="padding:8px;border-bottom:1px solid {CEREBRAS_GRAY};text-align:right;">{cnt}</td></tr>'
        for cat, cnt in category_counts.most_common()
    ])

    # Build issues HTML
    issues_html = ""
    for cat, cnt in category_counts.most_common():
        cat_issues = [i for i in issues if i.category == cat]
        issues_html += f'''
        <div style="margin-top:20px;">
            <h3 style="color:{CEREBRAS_ORANGE};border-left:4px solid {CEREBRAS_ORANGE};padding-left:10px;">
                {cat} ({cnt})
            </h3>
        '''
        for i in cat_issues[:10]:  # Limit to 10 per category
            page_name = i.page_url.split("/")[-1] or "index"
            issues_html += f'''
            <div style="background:{CEREBRAS_GRAY};padding:12px;margin:8px 0;border-radius:6px;">
                <strong style="color:{CEREBRAS_ORANGE};">[{page_name}]</strong>
                <span style="color:{CEREBRAS_TEXT};">{i.description[:100] + ('…' if len(i.description) > 100 else '')}</span>
            </div>
            '''
        issues_html += "</div>"

    html = f'''
    <div style="background:{CEREBRAS_DARK};color:{CEREBRAS_TEXT};padding:30px;border-radius:12px;font-family:system-ui;">
        <h1 style="color:{CEREBRAS_ORANGE};margin-bottom:5px;">Documentation Analysis Report</h1>
        <p style="color:#888;margin-top:0;">Site: <a href="{root_url}" style="color:{CEREBRAS_ORANGE};">{root_url}</a></p>
        <p style="color:#888;">Pages: {len(page_results)} | Issues: {len(issues)}</p>

        <h2 style="color:{CEREBRAS_TEXT};border-bottom:2px solid {CEREBRAS_ORANGE};padding-bottom:10px;">Summary</h2>
        <table style="width:100%;border-collapse:collapse;margin:15px 0;">
            <tr style="background:{CEREBRAS_GRAY};">
                <th style="padding:10px;text-align:left;">Category</th>
                <th style="padding:10px;text-align:right;">Count</th>
            </tr>
            {summary_rows}
        </table>

        <h2 style="color:{CEREBRAS_TEXT};border-bottom:2px solid {CEREBRAS_ORANGE};padding-bottom:10px;margin-top:30px;">Issues</h2>
        {issues_html}

        <div style="margin-top:30px;padding-top:20px;border-top:1px solid {CEREBRAS_GRAY};text-align:center;color:#666;">
            Powered by Cerebras + Browserbase
        </div>
    </div>
    '''
    display(HTML(html))

print("Styled display ready! Use: display_styled_report(page_results, DOCS_URL)")

Step 6: Run the Analysis

As a last step, configure your target docs site and run the full pipeline.
# Configure your target
DOCS_URL = "https://docs.browser-use.com/"
MAX_PAGES = 30
MAX_DEPTH = 2

# Crawl
page_results = await crawl(
    DOCS_URL,
    max_pages=MAX_PAGES,
    max_depth=MAX_DEPTH,
)
# Analyze
page_results = await analyze_all_optimized(page_results)

# Show results
summarize(page_results, DOCS_URL)
show_issues(page_results)
# Export report
report_md = export_markdown(page_results, DOCS_URL)
display(Markdown(report_md))
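To keep a copy of the report on disk (the filename is arbitrary):
with open("docs_report.md", "w", encoding="utf-8") as f:
    f.write(report_md)
print("Report saved to docs_report.md")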
This is a tutorial implementation. For production use, add rate limiting, authentication handling, improved crawl controls, and stricter output validation.