安装依赖

bash

pip install playwright pikepdf
playwright install chromium

代码

python

from playwright.sync_api import sync_playwright
from urllib.parse import urljoin, urlparse
import os
import re
import tempfile
import shutil

class DocusaurusCrawler:
    def __init__(self, start_url, pdf_file_name):
        self.start_url = start_url
        self.pdf_file_name = pdf_file_name
        self.base_domain = urlparse(start_url).netloc
        self.visited_urls = set()
        self.pages = []
    
    def close_blocking_overlays(self, page):
        """关闭可能遮挡点击的浮层、遮罩、弹窗等"""
        page.evaluate("""() => {
            // 关闭所有可能的浮层和遮罩
            const selectors = [
                '[data-floating-ui-portal]',
                '[role="dialog"]',
                '[role="alertdialog"]',
                '.modal',
                '.overlay',
                '.backdrop',
                '.toast',
                '.notification',
                // 常见的遮罩层 class
                '.fixed.inset-0',
                '.absolute.inset-0',
                'div[class*="mask"]',
                'div[class*="overlay"]',
                'div[class*="backdrop"]',
                'div[class*="portal"]'
            ];
            
            selectors.forEach(selector => {
                document.querySelectorAll(selector).forEach(el => {
                    // 方法1: 移除元素
                    el.remove();
                    // 方法2: 隐藏元素
                    // el.style.display = 'none';
                    // el.style.visibility = 'hidden';
                    // el.style.pointerEvents = 'none';
                });
            });
            
            // 确保 body 可点击
            document.body.style.pointerEvents = 'auto';
            document.body.style.overflow = 'auto';
        }""")
        
        # 按 ESC 键关闭可能的弹窗
        page.keyboard.press('Escape')
        page.wait_for_timeout(300)

    def expand_all_menus(self, page):
        """通过 JavaScript 强制点击展开所有菜单（绕过浮层遮挡）"""
        print("   🔓 展开所有折叠菜单...")
        
        max_attempts = 25
        for attempt in range(max_attempts):
            # 先关闭可能的遮挡浮层
            self.close_blocking_overlays(page)
            
            # 获取当前折叠项数量
            collapsed_count = page.evaluate("""() => {
                return document.querySelectorAll('.menu__list-item--collapsed').length;
            }""")
            
            if collapsed_count == 0:
                print(f"   ✅ 所有菜单已展开（{attempt} 次尝试）")
                break
            
            print(f"   第 {attempt+1} 次：还有 {collapsed_count} 个折叠项")
            
            # 使用 JS 点击（绕过 Playwright 的点击检查）
            clicked = page.evaluate("""() => {
                const item = document.querySelector('.menu__list-item--collapsed');
                if (!item) return false;
                
                // 找到可点击的元素
                const clickable = item.querySelector('.menu__link--sublist, .menu__caret, button');
                if (!clickable) {
                    // 如果没有特定点击元素，直接移除折叠标记
                    item.classList.remove('menu__list-item--collapsed');
                    return true;
                }
                
                // 触发点击事件
                const events = ['mousedown', 'click', 'mouseup'];
                events.forEach(type => {
                    const evt = new MouseEvent(type, {
                        bubbles: true,
                        cancelable: true,
                        view: window
                    });
                    clickable.dispatchEvent(evt);
                });
                
                // 同时直接修改 class
                item.classList.remove('menu__list-item--collapsed');
                
                // 显示子菜单
                const submenu = item.querySelector('.menu__list');
                if (submenu) {
                    submenu.style.display = 'block';
                    submenu.style.height = 'auto';
                }
                
                return true;
            }""")
            
            if not clicked:
                print("   ⚠️ 无法点击，尝试备用方案...")
                break
            
            # 等待 React 响应
            page.wait_for_timeout(800)
        
        # 最终强制展开所有剩余项
        final_count = page.evaluate("""() => {
            const items = document.querySelectorAll('.menu__list-item--collapsed');
            const count = items.length;
            
            items.forEach(item => {
                item.classList.remove('menu__list-item--collapsed');
                const submenu = item.querySelector('.menu__list');
                if (submenu) {
                    submenu.style.display = 'block';
                    submenu.style.height = 'auto';
                }
            });
            
            // 展开所有子列表
            document.querySelectorAll('.menu__link--sublist').forEach(link => {
                link.setAttribute('aria-expanded', 'true');
            });
            
            document.querySelectorAll('.menu__list').forEach(list => {
                list.style.display = 'block';
            });
            
            return count;
        }""")
        
        if final_count > 0:
            print(f"   强制展开了 {final_count} 个剩余项")
        
        page.wait_for_timeout(1000)

    def extract_sidebar_links(self, page):
        """提取侧边栏所有链接"""
        print("📑 提取导航目录...")
        
        page.wait_for_selector(".theme-doc-sidebar-menu", timeout=15000)
        page.wait_for_timeout(1000)
        
        # 先关闭浮层再展开菜单
        self.close_blocking_overlays(page)
        self.expand_all_menus(page)
        
        # 提取所有链接
        current_url = page.url.rstrip('/')
        
        links = page.evaluate("""(currentUrl) => {
            const links = [];
            const seen = new Set();
            
            document.querySelectorAll('.theme-doc-sidebar-menu a.menu__link[href]').forEach(link => {
                const url = link.href?.replace(/\\/$/, '');
                const title = link.textContent?.trim();
                
                if (!url || !title || url.includes('#')) return;
                
                // 计算层级深度
                let level = 1;
                let element = link;
                while (element) {
                    if (element.classList?.contains('menu__list-item')) {
                        const parent = element.parentElement?.closest('.menu__list-item');
                        if (parent) level++;
                    }
                    element = element.parentElement;
                }
                
                if (!seen.has(url)) {
                    seen.add(url);
                    links.push({ title, url, level: Math.min(level, 4) });
                }
            });
            
            if (!seen.has(currentUrl)) {
                const h1 = document.querySelector('h1')?.textContent?.trim() || '当前页面';
                links.unshift({ title: h1, url: currentUrl, level: 1 });
            }
            
            return links;
        }""", current_url)
        
        # 去重
        seen = set()
        unique_links = []
        for link in links:
            clean_url = link['url'].rstrip('/').split('#')[0]
            if clean_url not in seen and self.base_domain in clean_url and '/docs/' in clean_url:
                seen.add(clean_url)
                unique_links.append(link)
        
        print(f"✅ 共发现 {len(unique_links)} 个页面")
        
        # 分组显示
        current_section = ""
        section_num = 0
        for i, link in enumerate(unique_links[:80]):
            if link['level'] == 1:
                section_num += 1
                current_section = link['title']
                print(f"\n   【{section_num}】{link['title']}")
            else:
                indent = "    " + "  " * (link['level'] - 2)
                print(f"{indent}[{i+1}] {link['title'][:40]}")
        
        if len(unique_links) > 80:
            print(f"\n   ... 还有 {len(unique_links) - 80} 个页面")
        
        return unique_links

    def clean_content(self, html):
        html = re.sub(r'<a[^>]*aria-label="[^"]*Edit[^"]*"[^>]*>.*?</a>', '', html, flags=re.DOTALL)
        html = re.sub(r'<div[^>]*class="[^"]*last-updated[^"]*"[^>]*>.*?</div>', '', html, flags=re.DOTALL)
        html = re.sub(r'<nav[^>]*class="[^"]*pagination-nav[^"]*"[^>]*>.*?</nav>', '', html, flags=re.DOTALL)
        html = re.sub(r'<aside[^>]*class="[^"]*sidebar[^"]*"[^>]*>.*?</aside>', '', html, flags=re.DOTALL)
        html = re.sub(r'<nav[^>]*class="[^"]*breadcrumbs[^"]*"[^>]*>.*?</nav>', '', html, flags=re.DOTALL)
        html = re.sub(r'<div[^>]*class="[^"]*theme-doc-breadcrumbs[^"]*"[^>]*>.*?</div>', '', html, flags=re.DOTALL)
        html = re.sub(r'<div[^>]*class="[^"]*toc[^"]*"[^>]*>.*?</div>', '', html, flags=re.DOTALL)
        html = re.sub(r'<div[^>]*class="[^"]*buttonGroup[^"]*"[^>]*>.*?</div>', '', html, flags=re.DOTALL)
        html = re.sub(r'<h1[^>]*>.*?</h1>', '', html, flags=re.DOTALL)
        return html

    def crawl_page(self, page, url, title_hint=None):
        if url in self.visited_urls:
            return None
        
        self.visited_urls.add(url)
        short_title = title_hint or url.split('/')[-1] or "页面"
        print(f"🌐 [{len(self.visited_urls)}] {short_title[:50]}...")
        
        try:
            page.goto(url, wait_until="networkidle", timeout=30000)
            page.wait_for_timeout(1500)
            
            page.wait_for_selector(".theme-doc-markdown, article", timeout=10000)
            
            try:
                title = page.inner_text('h1', timeout=5000)
            except:
                title = title_hint or "无标题"
            
            content_html = page.evaluate("""() => {
                let el = document.querySelector('.theme-doc-markdown');
                if (!el) el = document.querySelector('article.markdown');
                if (!el) el = document.querySelector('main article');
                if (!el) return '';
                const clone = el.cloneNode(true);
                clone.querySelectorAll('.sidebar, .toc, .pagination-nav, .breadcrumbs, .buttonGroup').forEach(e => e.remove());
                return clone.innerHTML;
            }""")
            
            content_html = self.clean_content(content_html)
            
            return {
                'url': url,
                'title': title.strip(),
                'content': content_html,
                'level': 1
            }
        except Exception as e:
            print(f"   ❌ 失败: {e}")
            return None

    def save_pdf(self):
        import pikepdf
        
        print(f"\n📄 正在生成 PDF（{len(self.pages)} 个章节）...")
        temp_dir = tempfile.mkdtemp(prefix="pdf_pages_")
        
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch()
                temp_files = []
                
                for idx, page_info in enumerate(self.pages):
                    html_file = os.path.join(temp_dir, f"page_{idx:03d}.html")
                    pdf_file = os.path.join(temp_dir, f"page_{idx:03d}.pdf")
                    
                    html_content = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <style>
        :root {{ --primary: #1a73e8; --text: #202124; --code-bg: #f6f8fa; --border: #dadce0; }}
        body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; 
               line-height: 1.6; color: var(--text); margin: 0; padding: 20px 40px; }}
        .container {{ max-width: 900px; margin: 0 auto; }}
        h1 {{ font-size: 32px; color: var(--primary); border-bottom: 2px solid var(--border); 
              padding-bottom: 12px; margin-bottom: 24px; }}
        h2 {{ font-size: 24px; margin-top: 32px; }}
        h3 {{ font-size: 20px; margin-top: 24px; }}
        h4 {{ font-size: 18px; margin-top: 20px; }}
        pre {{ background: var(--code-bg); padding: 16px; border-radius: 6px; 
               overflow-x: auto; border: 1px solid var(--border); font-size: 13px; }}
        code {{ font-family: monospace; background: rgba(0,0,0,0.05); padding: 2px 6px; border-radius: 3px; }}
        table {{ width: 100%; border-collapse: collapse; margin: 20px 0; font-size: 14px; }}
        th, td {{ border: 1px solid var(--border); padding: 12px; text-align: left; }}
        th {{ background: var(--code-bg); font-weight: 600; }}
        img {{ max-width: 100%; height: auto; }}
        .footer {{ margin-top: 60px; padding-top: 20px; border-top: 1px solid var(--border);
                  text-align: right; font-size: 12px; color: #666; }}
    </style>
</head>
<body>
    <div class="container">
        <h1>{page_info["title"]}</h1>
        <div>{page_info["content"]}</div>
        <div class="footer">来源: {page_info['url']}</div>
    </div>
</body>
</html>"""
                    
                    with open(html_file, "w", encoding="utf-8") as f:
                        f.write(html_content)
                    
                    page = browser.new_page()
                    page.set_viewport_size({"width": 1440, "height": 900})
                    page.goto(f"file://{html_file}", wait_until="networkidle")
                    page.wait_for_timeout(800)
                    
                    page.pdf(
                        path=pdf_file,
                        format="A4",
                        print_background=True,
                        margin={"top": "20mm", "right": "15mm", "bottom": "20mm", "left": "15mm"},
                        display_header_footer=True,
                        header_template='<div style="font-size:9px; width:100%; text-align:center; color:#666;">XCOIN API 文档</div>',
                        footer_template='<div style="font-size:9px; width:100%; text-align:center; color:#666;">第 <span class="pageNumber"></span> 页 / 共 <span class="totalPages"></span> 页</div>'
                    )
                    
                    page.close()
                    os.remove(html_file)
                    temp_files.append(pdf_file)
                    
                    if (idx + 1) % 10 == 0 or idx == len(self.pages) - 1:
                        print(f"   进度: {idx+1}/{len(self.pages)}")
                
                browser.close()
            
            print("🔗 合并 PDF 并生成书签...")
            page_counts = []
            total_pages = 0
            
            for temp_file in temp_files:
                pdf = pikepdf.open(temp_file)
                num_pages = len(pdf.pages)
                page_counts.append({
                    'file': temp_file,
                    'pages': num_pages,
                    'start': total_pages
                })
                total_pages += num_pages
                pdf.close()
            
            merged = pikepdf.Pdf.new()
            with merged.open_outline() as outline:
                for idx, item in enumerate(page_counts):
                    title = self.pages[idx]['title'][:60]
                    outline.root.append(pikepdf.OutlineItem(title, item['start']))
                    
                    src = pikepdf.open(item['file'])
                    merged.pages.extend(src.pages)
                    src.close()
            
            merged.save(self.pdf_file_name)
            merged.close()
            
            print(f"\n✅ 完成！")
            print(f"   总页数: {total_pages}")
            print(f"   章节数: {len(self.pages)}")
            print(f"   文件: {self.pdf_file_name}")
            
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def run(self):
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page(viewport={"width": 1440, "height": 900})
            
            print(f"🚀 启动: {self.start_url}")
            page.goto(self.start_url, wait_until="networkidle")
            page.wait_for_timeout(2000)
            
            links = self.extract_sidebar_links(page)
            
            if len(links) > 0:
                print(f"\n🎯 开始爬取 {len(links)} 个页面...")
                for link in links:
                    result = self.crawl_page(page, link['url'], link['title'])
                    if result:
                        result['level'] = link['level']
                        self.pages.append(result)
            else:
                print("⚠️ 未发现链接")
            
            browser.close()
        
        if self.pages:
            self.save_pdf()
        else:
            print("❌ 没有爬取到内容")

if __name__ == "__main__":
    html_url = "https://www.xx.com/xxx/quick-start"
    pdf_file = "xxx.pdf"
    crawler = DocusaurusCrawler(html_url, pdf_file)
    crawler.run()

安装依赖 ​

代码 ​

安装依赖

代码