Review notes for this revision of the patch
-------------------------------------------
* bot.py: the __main__ hunk that commented out `if sys.platform == 'win32':`
  is dropped. asyncio.WindowsSelectorEventLoopPolicy exists only on Windows,
  so calling it unguarded raises AttributeError on every other platform.
* bot.py: _run_scraper_in_thread builds a ProactorEventLoop directly instead
  of replacing the process-wide event loop policy from a worker thread.
* bot.py: get_clean_content uses asyncio.to_thread; the per-call
  `with ThreadPoolExecutor()` blocked the running loop in its shutdown when
  the await was cancelled. `import concurrent.futures` is no longer needed.
* database.py: the upload log line used `{data["title"]}` inside a
  double-quoted f-string (SyntaxError before Python 3.12, KeyError when the
  key is missing). It now runs after `data = dict(data)` and uses `.get`.
* __pycache__/*.pyc entries (build artifacts, no patch data) and the
  whitespace-only scraper.py hunk are left out.
* Context-line indentation and blank lines were rebuilt from a
  whitespace-collapsed copy, and the `index` lines are carried over
  unchanged. Check both against the working tree before applying.

diff --git a/bot.py b/bot.py
index 721ea48..1ece4eb 100644
--- a/bot.py
+++ b/bot.py
@@ -1,5 +1,6 @@
 import os
 import asyncio
+import sys
 import logging
 from contextlib import suppress
 from dotenv import load_dotenv
@@ -10,7 +11,38 @@ from telegram.ext import ApplicationBuilder, CommandHandler, MessageHandler, fil
 # Import your existing logic
 from agent import parse_page
 from database import upload_entry
-from scraper import get_clean_content
+from scraper import get_clean_content as _get_clean_content
+
+
+def _run_scraper_in_thread(url: str) -> str:
+    """Run the scraper coroutine to completion on a private event loop.
+
+    Meant to be called from a worker thread. The bot's own loop is created
+    under the Windows Selector policy, while Playwright needs a Proactor
+    loop to spawn its browser subprocess, so the scraper gets its own loop.
+    """
+    # Build the Proactor loop directly instead of swapping the process-wide
+    # policy, which would also affect loops created later on other threads.
+    if sys.platform == 'win32':
+        loop = asyncio.ProactorEventLoop()
+    else:
+        loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        return loop.run_until_complete(_get_clean_content(url))
+    finally:
+        asyncio.set_event_loop(None)
+        loop.close()
+
+
+async def get_clean_content(url: str) -> str:
+    """Scrape *url* in a worker thread without blocking the bot's loop.
+
+    ``asyncio.to_thread`` runs on the loop's default executor, so a cancelled
+    await returns immediately; a per-call ``with ThreadPoolExecutor()`` block
+    would hold the loop in its blocking shutdown until the scrape finished.
+    """
+    return await asyncio.to_thread(_run_scraper_in_thread, url)
 
 load_dotenv()
 logging.getLogger("httpx").setLevel(logging.WARNING)
diff --git a/database.py b/database.py
index 58576f8..558d56c 100644
--- a/database.py
+++ b/database.py
@@ -8,6 +8,7 @@ load_dotenv()
 
 pb = PocketBase(os.getenv('POCKETBASE_URL'))
 admin_data = pb.admins.auth_with_password(os.getenv('POCKETBASE_ADMIN_EMAIL'), os.getenv('POCKETBASE_ADMIN_PASSWORD'))
+show_debug_msg = False
 
 def convert_datetime_to_pocketbase(date_time_str):
     """
@@ -18,7 +19,8 @@ def convert_datetime_to_pocketbase(date_time_str):
         return None
 
     try:
-        print(f"[DEBUG] Converting datetime: '{date_time_str}' (type: {type(date_time_str)})")
+        if show_debug_msg:
+            print(f"[DEBUG] Converting datetime: '{date_time_str}' (type: {type(date_time_str)})")
 
         # Parse the input format: "DD-MM-YYYY HH:MM" or "DD-MM-YYYY (HH:MM)"
         date_time_str = date_time_str.replace("(", "").replace(")", "").strip()
@@ -32,7 +34,8 @@ def convert_datetime_to_pocketbase(date_time_str):
 
         # Convert to PocketBase local datetime format: YYYY-MM-DD HH:MM:SS
         pb_format = dt.strftime("%Y-%m-%d %H:%M:%S")
-        print(f"[DEBUG] Converted to PocketBase format: '{pb_format}'")
+        if show_debug_msg:
+            print(f"[DEBUG] Converted to PocketBase format: '{pb_format}'")
         return pb_format
     except Exception as e:
         print(f"[ERROR] Error converting datetime '{date_time_str}': {e}")
@@ -49,7 +52,7 @@ def upload_entry(data, entry_type='opportunity', url=None):
         entry_type: 'opportunity' or 'event'
         url: The source URL of the entry
     """
-    print(f"[DEBUG] Uploading {entry_type} entry. Data: {data}")
     data = dict(data)
+    print(f"[DEBUG] Uploading {entry_type} entry. Data: {data.get('title')}")
 
     # Add URL to data if provided
@@ -66,7 +69,8 @@ def upload_entry(data, entry_type='opportunity', url=None):
             data['datetime'] = convert_datetime_to_pocketbase(data['date_time'])
             # Remove the original field since PocketBase expects 'datetime'
             del data['date_time']
-            print(f"[DEBUG] Event datetime: '{original_dt}' -> '{data['datetime']}'")
+            if show_debug_msg:
+                print(f"[DEBUG] Event datetime: '{original_dt}' -> '{data['datetime']}'")
         else:
             print(f"[WARNING] No 'date_time' field found in event data")
 
@@ -81,7 +85,8 @@ def upload_entry(data, entry_type='opportunity', url=None):
             original_deadline = data['deadline']
             # Convert deadline to PocketBase datetime format
             data['deadline'] = convert_datetime_to_pocketbase(data['deadline'])
-            print(f"[DEBUG] Opportunity deadline: '{original_deadline}' -> '{data['deadline']}'")
+            if show_debug_msg:
+                print(f"[DEBUG] Opportunity deadline: '{original_deadline}' -> '{data['deadline']}'")
         else:
             print(f"[WARNING] No 'deadline' field found in opportunity data")
 