安装依赖
bash
pip install playwright pikepdf
playwright install chromium代码
python
from playwright.sync_api import sync_playwright
from urllib.parse import urljoin, urlparse
import os
import re
import tempfile
import shutil
class DocusaurusCrawler:
def __init__(self, start_url, pdf_file_name):
self.start_url = start_url
self.pdf_file_name = pdf_file_name
self.base_domain = urlparse(start_url).netloc
self.visited_urls = set()
self.pages = []
def close_blocking_overlays(self, page):
"""关闭可能遮挡点击的浮层、遮罩、弹窗等"""
page.evaluate("""() => {
// 关闭所有可能的浮层和遮罩
const selectors = [
'[data-floating-ui-portal]',
'[role="dialog"]',
'[role="alertdialog"]',
'.modal',
'.overlay',
'.backdrop',
'.toast',
'.notification',
// 常见的遮罩层 class
'.fixed.inset-0',
'.absolute.inset-0',
'div[class*="mask"]',
'div[class*="overlay"]',
'div[class*="backdrop"]',
'div[class*="portal"]'
];
selectors.forEach(selector => {
document.querySelectorAll(selector).forEach(el => {
// 方法1: 移除元素
el.remove();
// 方法2: 隐藏元素
// el.style.display = 'none';
// el.style.visibility = 'hidden';
// el.style.pointerEvents = 'none';
});
});
// 确保 body 可点击
document.body.style.pointerEvents = 'auto';
document.body.style.overflow = 'auto';
}""")
# 按 ESC 键关闭可能的弹窗
page.keyboard.press('Escape')
page.wait_for_timeout(300)
def expand_all_menus(self, page):
"""通过 JavaScript 强制点击展开所有菜单(绕过浮层遮挡)"""
print(" 🔓 展开所有折叠菜单...")
max_attempts = 25
for attempt in range(max_attempts):
# 先关闭可能的遮挡浮层
self.close_blocking_overlays(page)
# 获取当前折叠项数量
collapsed_count = page.evaluate("""() => {
return document.querySelectorAll('.menu__list-item--collapsed').length;
}""")
if collapsed_count == 0:
print(f" ✅ 所有菜单已展开({attempt} 次尝试)")
break
print(f" 第 {attempt+1} 次:还有 {collapsed_count} 个折叠项")
# 使用 JS 点击(绕过 Playwright 的点击检查)
clicked = page.evaluate("""() => {
const item = document.querySelector('.menu__list-item--collapsed');
if (!item) return false;
// 找到可点击的元素
const clickable = item.querySelector('.menu__link--sublist, .menu__caret, button');
if (!clickable) {
// 如果没有特定点击元素,直接移除折叠标记
item.classList.remove('menu__list-item--collapsed');
return true;
}
// 触发点击事件
const events = ['mousedown', 'click', 'mouseup'];
events.forEach(type => {
const evt = new MouseEvent(type, {
bubbles: true,
cancelable: true,
view: window
});
clickable.dispatchEvent(evt);
});
// 同时直接修改 class
item.classList.remove('menu__list-item--collapsed');
// 显示子菜单
const submenu = item.querySelector('.menu__list');
if (submenu) {
submenu.style.display = 'block';
submenu.style.height = 'auto';
}
return true;
}""")
if not clicked:
print(" ⚠️ 无法点击,尝试备用方案...")
break
# 等待 React 响应
page.wait_for_timeout(800)
# 最终强制展开所有剩余项
final_count = page.evaluate("""() => {
const items = document.querySelectorAll('.menu__list-item--collapsed');
const count = items.length;
items.forEach(item => {
item.classList.remove('menu__list-item--collapsed');
const submenu = item.querySelector('.menu__list');
if (submenu) {
submenu.style.display = 'block';
submenu.style.height = 'auto';
}
});
// 展开所有子列表
document.querySelectorAll('.menu__link--sublist').forEach(link => {
link.setAttribute('aria-expanded', 'true');
});
document.querySelectorAll('.menu__list').forEach(list => {
list.style.display = 'block';
});
return count;
}""")
if final_count > 0:
print(f" 强制展开了 {final_count} 个剩余项")
page.wait_for_timeout(1000)
def extract_sidebar_links(self, page):
"""提取侧边栏所有链接"""
print("📑 提取导航目录...")
page.wait_for_selector(".theme-doc-sidebar-menu", timeout=15000)
page.wait_for_timeout(1000)
# 先关闭浮层再展开菜单
self.close_blocking_overlays(page)
self.expand_all_menus(page)
# 提取所有链接
current_url = page.url.rstrip('/')
links = page.evaluate("""(currentUrl) => {
const links = [];
const seen = new Set();
document.querySelectorAll('.theme-doc-sidebar-menu a.menu__link[href]').forEach(link => {
const url = link.href?.replace(/\\/$/, '');
const title = link.textContent?.trim();
if (!url || !title || url.includes('#')) return;
// 计算层级深度
let level = 1;
let element = link;
while (element) {
if (element.classList?.contains('menu__list-item')) {
const parent = element.parentElement?.closest('.menu__list-item');
if (parent) level++;
}
element = element.parentElement;
}
if (!seen.has(url)) {
seen.add(url);
links.push({ title, url, level: Math.min(level, 4) });
}
});
if (!seen.has(currentUrl)) {
const h1 = document.querySelector('h1')?.textContent?.trim() || '当前页面';
links.unshift({ title: h1, url: currentUrl, level: 1 });
}
return links;
}""", current_url)
# 去重
seen = set()
unique_links = []
for link in links:
clean_url = link['url'].rstrip('/').split('#')[0]
if clean_url not in seen and self.base_domain in clean_url and '/docs/' in clean_url:
seen.add(clean_url)
unique_links.append(link)
print(f"✅ 共发现 {len(unique_links)} 个页面")
# 分组显示
current_section = ""
section_num = 0
for i, link in enumerate(unique_links[:80]):
if link['level'] == 1:
section_num += 1
current_section = link['title']
print(f"\n 【{section_num}】{link['title']}")
else:
indent = " " + " " * (link['level'] - 2)
print(f"{indent}[{i+1}] {link['title'][:40]}")
if len(unique_links) > 80:
print(f"\n ... 还有 {len(unique_links) - 80} 个页面")
return unique_links
def clean_content(self, html):
html = re.sub(r'<a[^>]*aria-label="[^"]*Edit[^"]*"[^>]*>.*?</a>', '', html, flags=re.DOTALL)
html = re.sub(r'<div[^>]*class="[^"]*last-updated[^"]*"[^>]*>.*?</div>', '', html, flags=re.DOTALL)
html = re.sub(r'<nav[^>]*class="[^"]*pagination-nav[^"]*"[^>]*>.*?</nav>', '', html, flags=re.DOTALL)
html = re.sub(r'<aside[^>]*class="[^"]*sidebar[^"]*"[^>]*>.*?</aside>', '', html, flags=re.DOTALL)
html = re.sub(r'<nav[^>]*class="[^"]*breadcrumbs[^"]*"[^>]*>.*?</nav>', '', html, flags=re.DOTALL)
html = re.sub(r'<div[^>]*class="[^"]*theme-doc-breadcrumbs[^"]*"[^>]*>.*?</div>', '', html, flags=re.DOTALL)
html = re.sub(r'<div[^>]*class="[^"]*toc[^"]*"[^>]*>.*?</div>', '', html, flags=re.DOTALL)
html = re.sub(r'<div[^>]*class="[^"]*buttonGroup[^"]*"[^>]*>.*?</div>', '', html, flags=re.DOTALL)
html = re.sub(r'<h1[^>]*>.*?</h1>', '', html, flags=re.DOTALL)
return html
def crawl_page(self, page, url, title_hint=None):
if url in self.visited_urls:
return None
self.visited_urls.add(url)
short_title = title_hint or url.split('/')[-1] or "页面"
print(f"🌐 [{len(self.visited_urls)}] {short_title[:50]}...")
try:
page.goto(url, wait_until="networkidle", timeout=30000)
page.wait_for_timeout(1500)
page.wait_for_selector(".theme-doc-markdown, article", timeout=10000)
try:
title = page.inner_text('h1', timeout=5000)
except:
title = title_hint or "无标题"
content_html = page.evaluate("""() => {
let el = document.querySelector('.theme-doc-markdown');
if (!el) el = document.querySelector('article.markdown');
if (!el) el = document.querySelector('main article');
if (!el) return '';
const clone = el.cloneNode(true);
clone.querySelectorAll('.sidebar, .toc, .pagination-nav, .breadcrumbs, .buttonGroup').forEach(e => e.remove());
return clone.innerHTML;
}""")
content_html = self.clean_content(content_html)
return {
'url': url,
'title': title.strip(),
'content': content_html,
'level': 1
}
except Exception as e:
print(f" ❌ 失败: {e}")
return None
def save_pdf(self):
import pikepdf
print(f"\n📄 正在生成 PDF({len(self.pages)} 个章节)...")
temp_dir = tempfile.mkdtemp(prefix="pdf_pages_")
try:
with sync_playwright() as p:
browser = p.chromium.launch()
temp_files = []
for idx, page_info in enumerate(self.pages):
html_file = os.path.join(temp_dir, f"page_{idx:03d}.html")
pdf_file = os.path.join(temp_dir, f"page_{idx:03d}.pdf")
html_content = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<style>
:root {{ --primary: #1a73e8; --text: #202124; --code-bg: #f6f8fa; --border: #dadce0; }}
body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
line-height: 1.6; color: var(--text); margin: 0; padding: 20px 40px; }}
.container {{ max-width: 900px; margin: 0 auto; }}
h1 {{ font-size: 32px; color: var(--primary); border-bottom: 2px solid var(--border);
padding-bottom: 12px; margin-bottom: 24px; }}
h2 {{ font-size: 24px; margin-top: 32px; }}
h3 {{ font-size: 20px; margin-top: 24px; }}
h4 {{ font-size: 18px; margin-top: 20px; }}
pre {{ background: var(--code-bg); padding: 16px; border-radius: 6px;
overflow-x: auto; border: 1px solid var(--border); font-size: 13px; }}
code {{ font-family: monospace; background: rgba(0,0,0,0.05); padding: 2px 6px; border-radius: 3px; }}
table {{ width: 100%; border-collapse: collapse; margin: 20px 0; font-size: 14px; }}
th, td {{ border: 1px solid var(--border); padding: 12px; text-align: left; }}
th {{ background: var(--code-bg); font-weight: 600; }}
img {{ max-width: 100%; height: auto; }}
.footer {{ margin-top: 60px; padding-top: 20px; border-top: 1px solid var(--border);
text-align: right; font-size: 12px; color: #666; }}
</style>
</head>
<body>
<div class="container">
<h1>{page_info["title"]}</h1>
<div>{page_info["content"]}</div>
<div class="footer">来源: {page_info['url']}</div>
</div>
</body>
</html>"""
with open(html_file, "w", encoding="utf-8") as f:
f.write(html_content)
page = browser.new_page()
page.set_viewport_size({"width": 1440, "height": 900})
page.goto(f"file://{html_file}", wait_until="networkidle")
page.wait_for_timeout(800)
page.pdf(
path=pdf_file,
format="A4",
print_background=True,
margin={"top": "20mm", "right": "15mm", "bottom": "20mm", "left": "15mm"},
display_header_footer=True,
header_template='<div style="font-size:9px; width:100%; text-align:center; color:#666;">XCOIN API 文档</div>',
footer_template='<div style="font-size:9px; width:100%; text-align:center; color:#666;">第 <span class="pageNumber"></span> 页 / 共 <span class="totalPages"></span> 页</div>'
)
page.close()
os.remove(html_file)
temp_files.append(pdf_file)
if (idx + 1) % 10 == 0 or idx == len(self.pages) - 1:
print(f" 进度: {idx+1}/{len(self.pages)}")
browser.close()
print("🔗 合并 PDF 并生成书签...")
page_counts = []
total_pages = 0
for temp_file in temp_files:
pdf = pikepdf.open(temp_file)
num_pages = len(pdf.pages)
page_counts.append({
'file': temp_file,
'pages': num_pages,
'start': total_pages
})
total_pages += num_pages
pdf.close()
merged = pikepdf.Pdf.new()
with merged.open_outline() as outline:
for idx, item in enumerate(page_counts):
title = self.pages[idx]['title'][:60]
outline.root.append(pikepdf.OutlineItem(title, item['start']))
src = pikepdf.open(item['file'])
merged.pages.extend(src.pages)
src.close()
merged.save(self.pdf_file_name)
merged.close()
print(f"\n✅ 完成!")
print(f" 总页数: {total_pages}")
print(f" 章节数: {len(self.pages)}")
print(f" 文件: {self.pdf_file_name}")
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
def run(self):
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page(viewport={"width": 1440, "height": 900})
print(f"🚀 启动: {self.start_url}")
page.goto(self.start_url, wait_until="networkidle")
page.wait_for_timeout(2000)
links = self.extract_sidebar_links(page)
if len(links) > 0:
print(f"\n🎯 开始爬取 {len(links)} 个页面...")
for link in links:
result = self.crawl_page(page, link['url'], link['title'])
if result:
result['level'] = link['level']
self.pages.append(result)
else:
print("⚠️ 未发现链接")
browser.close()
if self.pages:
self.save_pdf()
else:
print("❌ 没有爬取到内容")
if __name__ == "__main__":
html_url = "https://www.xx.com/xxx/quick-start"
pdf_file = "xxx.pdf"
crawler = DocusaurusCrawler(html_url, pdf_file)
crawler.run()