scrape_docs.py

#!/usr/bin/env python3
import asyncio
import logging
import re
from pathlib import Path
from typing import List, Set
from urllib.parse import urlparse

import aiohttp
from bs4 import BeautifulSoup
from dotenv import load_dotenv
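# Third-party dependencies (assumed install command; the source does not pin
# versions):
#   pip install aiohttp beautifulsoup4 lxml python-dotenv
# lxml is included because BeautifulSoup's "xml" parser, used for the sitemap
# below, requires it.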
# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("docs-scraper")

# Load environment variables
load_dotenv()

BASE_URL = "https://docs.livekit.io"
SITEMAP_URL = f"{BASE_URL}/sitemap.xml"
OUTPUT_FILE = Path(__file__).parent / "data/raw_data.txt"
EXCLUDED_PATHS = ["/reference"]  # Paths to exclude from scraping

class DocsScraper:
    def __init__(self):
        self.visited_urls: Set[str] = set()
        self.content: List[str] = []
        self.session = None

    async def init_session(self):
        """Initialize the aiohttp session."""
        self.session = aiohttp.ClientSession()

    async def close_session(self):
        """Close the aiohttp session."""
        if self.session:
            await self.session.close()

    def should_exclude_url(self, url: str) -> bool:
        """Check if a URL should be excluded from scraping."""
        parsed = urlparse(url)
        return any(parsed.path.startswith(path) for path in EXCLUDED_PATHS)
    async def fetch_sitemap(self) -> List[str]:
        """Fetch and parse the sitemap to get all URLs."""
        async with self.session.get(SITEMAP_URL) as response:
            if response.status != 200:
                raise Exception(f"Failed to fetch sitemap: {response.status}")
            content = await response.text()
            # The "xml" parser requires lxml to be installed.
            soup = BeautifulSoup(content, "xml")
            urls = [loc.text for loc in soup.find_all("loc")]
            # Filter out excluded URLs and ensure they're from docs.livekit.io
            return [
                url for url in urls
                if url.startswith(BASE_URL) and not self.should_exclude_url(url)
            ]
    async def fetch_page(self, url: str) -> str:
        """Fetch a single page and extract its content."""
        try:
            async with self.session.get(url) as response:
                if response.status != 200:
                    logger.warning(f"Failed to fetch {url}: {response.status}")
                    return ""
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")
                # Extract the main content
                main_content = soup.find("main")
                if not main_content:
                    return ""
                # Remove unwanted elements
                for element in main_content.find_all(["nav", "footer", "header", "script", "style"]):
                    element.decompose()
                # Clean up the text
                text = main_content.get_text(separator="\n", strip=True)
                text = re.sub(r"\n\s*\n", "\n\n", text)  # Remove excessive newlines
                return text.strip()
        except Exception as e:
            logger.error(f"Error fetching {url}: {e}")
            return ""
    async def scrape(self):
        """Main scraping function."""
        await self.init_session()
        try:
            # Get all URLs from sitemap
            urls = await self.fetch_sitemap()
            logger.info(f"Found {len(urls)} URLs to scrape")
            # Process each URL
            for url in urls:
                if url in self.visited_urls:
                    continue
                self.visited_urls.add(url)
                logger.info(f"Scraping {url}")
                content = await self.fetch_page(url)
                if content:
                    self.content.append(f"Content from {url}:\n\n{content}\n\n")
        finally:
            await self.close_session()
    def save_content(self):
        """Save the scraped content to a file."""
        # Make sure the output directory exists before writing.
        OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            f.write("\n".join(self.content))
        logger.info(f"Saved content to {OUTPUT_FILE}")
async def main():
    """Main function to run the scraper."""
    scraper = DocsScraper()
    await scraper.scrape()
    scraper.save_content()


if __name__ == "__main__":
    asyncio.run(main())
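# Usage sketch (assumed invocation; the repository may wrap this in its own
# runner):
#   python scrape_docs.py
# The scraper walks every sitemap URL under https://docs.livekit.io, skips
# paths beginning with /reference, and writes the extracted text to
# data/raw_data.txt next to this script.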