# Example: fetch a page with crawl4ai and return it as cleaned-up markdown.
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator
|
|
|
|
async def get_clean_content(url: str):
    """Crawl *url* and return the page content as cleaned-up markdown.

    The generated markdown drops links and images and disables line
    wrapping.  The crawler bypasses its cache (so the page is always
    re-fetched), scrolls to the bottom of the page and waits 3 seconds
    before capturing the HTML, to give lazily-loaded content a chance
    to render.
    """
    # Markdown generation settings: no links, no images, body_width=0
    # disables hard line wrapping in the output.
    markdown_options = {
        "ignore_links": True,
        "ignore_images": True,
        "body_width": 0,
    }
    md_gen = DefaultMarkdownGenerator(options=markdown_options)

    browser_config = BrowserConfig(
        # cdp_url="http://127.0.0.1:9222", # Use your existing Chrome session
        # Present a desktop Chrome user agent instead of the default.
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )

    run_config = CrawlerRunConfig(
        cache_mode="bypass",            # always re-fetch, never serve cached
        word_count_threshold=10,        # minimum word count per content block
        excluded_tags=["nav", "script", "style"],
        markdown_generator=md_gen,
        delay_before_return_html=3.0,   # wait for JS-rendered content
        # Scroll to the bottom to trigger lazy-loaded elements.
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        magic=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawl_result = await crawler.arun(url=url, config=run_config)
        return crawl_result.markdown