browser_optimization_example.py
"""
This example demonstrates optimal browser usage patterns in Crawl4AI:
1. Sequential crawling with session reuse
2. Parallel crawling with browser instance reuse
3. Performance optimization settings
"""
import asyncio
import os
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


async def crawl_sequential(urls: List[str]):
    """
    Sequential crawling using session reuse - most efficient for moderate workloads
    """
    print("\n=== Sequential Crawling with Session Reuse ===")

    # Configure browser with optimized settings
    browser_config = BrowserConfig(
        headless=True,
        browser_args=[
            "--disable-gpu",  # Disable GPU acceleration
            "--disable-dev-shm-usage",  # Disable /dev/shm usage
            "--no-sandbox",  # Required for Docker
        ],
        viewport={
            "width": 800,
            "height": 600,
        },  # Smaller viewport for better performance
    )

    # Configure crawl settings
    crawl_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            # content_filter=PruningContentFilter(),  # uncomment if you need fit_markdown
        ),
    )

    # Create single crawler instance
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        session_id = "session1"  # Use same session for all URLs
        for url in urls:
            result = await crawler.arun(
                url=url,
                config=crawl_config,
                session_id=session_id,  # Reuse same browser tab
            )
            if result.success:
                print(f"Successfully crawled {url}")
                print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
    finally:
        await crawler.close()
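

# Note: AsyncWebCrawler also works as an async context manager, which handles
# start()/close() for you. A minimal sketch of the same session-reuse loop,
# assuming the browser_config/crawl_config objects defined above:
#
#     async with AsyncWebCrawler(config=browser_config) as crawler:
#         for url in urls:
#             result = await crawler.arun(
#                 url=url, config=crawl_config, session_id="session1"
#             )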


async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
    """
    Parallel crawling while reusing browser instance - best for large workloads
    """
    print("\n=== Parallel Crawling with Browser Reuse ===")

    browser_config = BrowserConfig(
        headless=True,
        browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
        viewport={"width": 800, "height": 600},
    )

    crawl_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            # content_filter=PruningContentFilter(),  # uncomment if you need fit_markdown
        ),
    )

    # Create single crawler instance for all parallel tasks
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        # Create tasks in batches to control concurrency
        for i in range(0, len(urls), max_concurrent):
            batch = urls[i : i + max_concurrent]
            tasks = []
            for j, url in enumerate(batch):
                session_id = (
                    f"parallel_session_{j}"  # Different session per concurrent task
                )
                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
                tasks.append(task)

            # Wait for batch to complete
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Process results
            for url, result in zip(batch, results):
                if isinstance(result, Exception):
                    print(f"Error crawling {url}: {str(result)}")
                elif result.success:
                    print(f"Successfully crawled {url}")
                    print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
    finally:
        await crawler.close()
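

# Depending on the installed crawl4ai version, AsyncWebCrawler may also expose
# arun_many(), which manages concurrent crawling of a URL list internally. The
# helper below is an illustrative sketch under that assumption (the function name
# crawl_with_arun_many is ours, not part of the library); the manual batching in
# crawl_parallel() above does not require it.
async def crawl_with_arun_many(urls: List[str]):
    browser_config = BrowserConfig(headless=True)
    crawl_config = CrawlerRunConfig()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls, config=crawl_config)
        for result in results:
            status = "OK" if result.success else "FAILED"
            print(f"{status}: {result.url}")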


async def main():
    # Example URLs
    urls = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
        "https://example.com/page4",
    ]

    # Demo sequential crawling
    await crawl_sequential(urls)

    # Demo parallel crawling
    await crawl_parallel(urls, max_concurrent=2)


if __name__ == "__main__":
    asyncio.run(main())