first commit
scraper.py (new file, +29 lines)
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig, DefaultMarkdownGenerator


async def get_clean_content(url: str):
    # Generate markdown without links, images, or hard line wrapping.
    md_generator = DefaultMarkdownGenerator(
        options={
            "ignore_links": True,
            "ignore_images": True,
            "body_width": 0,
        }
    )

    browser_conf = BrowserConfig(
        # cdp_url="http://127.0.0.1:9222",  # Use your existing Chrome session
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )

    run_conf = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # always fetch fresh content, skip the cache
        word_count_threshold=10,  # drop text blocks shorter than 10 words
        excluded_tags=["nav", "script", "style"],
        markdown_generator=md_generator,
        delay_before_return_html=3.0,  # give late-loading content time to render
        js_code="window.scrollTo(0, document.body.scrollHeight);",  # trigger lazy loading
        magic=True,
    )

    async with AsyncWebCrawler(config=browser_conf) as crawler:
        result = await crawler.arun(url=url, config=run_conf)
        return result.markdown
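A minimal way to try the new helper from a script entry point is sketched below; the target URL is just a placeholder, and the import assumes the file is on the Python path as scraper.py (as committed here):

import asyncio

from scraper import get_clean_content

if __name__ == "__main__":
    # Placeholder URL; substitute the page you actually want to scrape.
    content = asyncio.run(get_clean_content("https://example.com"))
    print(content)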