Files
null-bot/scraper.py

29 lines
1.0 KiB
Python

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlerRunConfig,
    DefaultMarkdownGenerator,
)
async def get_clean_content(url: str):
    """Crawl *url* with crawl4ai and return the page rendered as markdown.

    The markdown generator is configured to drop links and images and to
    disable line wrapping (``body_width=0``). The crawl bypasses the cache,
    excludes ``nav``/``script``/``style`` tags, and runs a small script that
    scrolls to the bottom of the page before the HTML is captured.

    Args:
        url: Address of the page to crawl.

    Returns:
        The ``markdown`` attribute of the crawl result, returned as-is.
        NOTE(review): this function does not inspect the result for
        failure, so callers should be prepared for an empty/missing value
        when the crawl does not succeed -- confirm against the crawl4ai
        ``CrawlResult`` contract.
    """
    markdown_generator = DefaultMarkdownGenerator(
        options={
            "ignore_links": True,
            "ignore_images": True,
            "body_width": 0,
        },
    )

    # To reuse an already-running Chrome session instead of launching a new
    # browser, pass cdp_url="http://127.0.0.1:9222" to BrowserConfig.
    desktop_chrome_user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    )
    browser_config = BrowserConfig(user_agent=desktop_chrome_user_agent)

    run_config = CrawlerRunConfig(
        # Use the enum member rather than the raw string "bypass": the
        # library's cache checks compare against CacheMode members.
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=10,
        excluded_tags=["nav", "script", "style"],
        markdown_generator=markdown_generator,
        # Presumably gives lazily loaded content time to render after the
        # scroll below -- TODO confirm the unit is seconds.
        delay_before_return_html=3.0,
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        magic=True,
    )

    # The context manager owns the crawler's lifetime, so it is released
    # even if the crawl raises.
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=run_config)
        return result.markdown