first commit

2026-05-10 13:14:14 +01:00
commit 4726582379
14 changed files with 854 additions and 0 deletions

1
.gitignore vendored Normal file

@@ -0,0 +1 @@
.env

115
README.md Normal file

@@ -0,0 +1,115 @@
# null-bot
> A small Telegram bot that extracts and saves opportunities and events from web pages or pasted text. An LLM agent parses content into structured JSON, and entries are stored in a local PocketBase instance. The bot is built entirely on open-source tools; the LLM of choice is IBM's granite4.1:8b, released under the Apache 2.0 License.
## Features
- Parse Opportunity (`/op`) and Event (`/ev`) entries from a URL or pasted text
- Two entry types with separate system prompts and JSON schemas (externalized to `prompts.py`)
- Follow-up prompt when users paste text: the bot asks for a source URL only at save time
- Converts date/time to PocketBase-friendly format (`YYYY-MM-DD HH:MM:SS`)
- Retry decorator for robust LLM / network calls
## Requirements
- Python 3.11+ recommended
- See `requirements.txt` for full dependency list
## Setup
1. Clone the repo or copy files to your machine.
2. Create and activate a Python virtual environment:
```bash
python -m venv .venv
# Windows
.venv\Scripts\activate
# macOS / Linux
source .venv/bin/activate
```
3. Install dependencies:
```bash
pip install -r requirements.txt
```
4. Configure environment variables
- Create a `.env` file in the project root with at minimum:
```
TG_TOKEN=your_telegram_bot_token_here
OLLAMA_BASE_URL=http://localhost:11434/v1
ALLOWED_USERS=1234,5678
POCKETBASE_URL=http://127.0.0.1:8090
POCKETBASE_ADMIN_EMAIL=admin@example.com
POCKETBASE_ADMIN_PASSWORD=secret
```
- Notes:
- `ALLOWED_USERS` should be a comma-separated list of Telegram user IDs (no brackets).
- The bot reads `TG_TOKEN` and `ALLOWED_USERS` from the environment.
5. Ollama (local LLM) setup
- This project uses a local Ollama instance (or any compatible local LLM HTTP API) as the LLM provider. The bot expects an HTTP endpoint available at `OLLAMA_BASE_URL` (default `http://localhost:11434/v1`).
- Quick steps to get Ollama running locally:
1. Install Ollama for your platform — follow the official instructions: https://ollama.com/docs (or use the native installer for Windows/macOS/Linux).
2. Pull or install a model you want to use. Example (CLI):
```bash
ollama pull granite4.1:8b
```
3. Start the Ollama daemon / HTTP API so the bot can reach it (recent installers usually start it automatically):
```bash
# example command; consult the Ollama docs if yours differs
ollama serve
```
4. Set `OLLAMA_BASE_URL` in your `.env` to point to the running API, for example:
```text
OLLAMA_BASE_URL=http://localhost:11434/v1
```
5. Verify the API is reachable (example curl):
```bash
curl -s -X POST "${OLLAMA_BASE_URL}/completions" \
-H "Content-Type: application/json" \
-d '{"model":"<model-name>","prompt":"hello","max_tokens":16}'
```
A successful response indicates your Ollama HTTP API is reachable and can serve model requests.
- Notes and troubleshooting
- If your Ollama installation exposes a different port or path, update `OLLAMA_BASE_URL` accordingly.
- If you prefer hosted LLMs (OpenAI, Anthropic, Cohere, etc.), `agent.py` can be adapted to use other providers; ensure the provider client is configured and the prompts in `prompts.py` are compatible.
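As a rough illustration, swapping providers mostly means changing the model handed to the agents in `agent.py`. Below is a minimal, untested sketch assuming pydantic-ai's built-in OpenAI support and an `OPENAI_API_KEY` in your environment; the model name is illustrative:
```python
# agent.py (sketch): swap the local Ollama model for a hosted one
from pydantic_ai import Agent
from prompts import OPPORTUNITY_PROMPT

# "openai:gpt-4o-mini" is an illustrative model string; pydantic-ai picks up
# OPENAI_API_KEY from the environment for openai: models.
opportunity_agent = Agent(
    "openai:gpt-4o-mini",
    output_type=str,
    system_prompt=OPPORTUNITY_PROMPT,
    retries=5,
)
```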
## Running the bot
Start the bot with the project's entrypoint (example):
```bash
python bot.py
```
The bot listens for commands:
- `/op <url or paste>` — parse an opportunity
- `/ev <url or paste>` — parse an event
If you paste text instead of sending a URL, the bot will parse it, and when you click Save it will prompt you for a source URL (or you can `/skip`).
## How it works (high-level)
- `agent.py` uses `pydantic-ai` + a local LLM provider (e.g. Ollama) and system prompts from `prompts.py` to parse pages/text into structured JSON.
- `database.py` converts datetime fields and uploads the entry to the appropriate PocketBase collection (`events` or `opportunities`).
- `bot.py` handles Telegram interactions, queues parse tasks, and preserves per-user state in `context.user_data`.
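For a concrete picture of that flow, here is a minimal sketch that composes the same functions outside Telegram (assumes PocketBase and Ollama are running and `.env` is configured; the URL is illustrative):
```python
import asyncio

from scraper import get_clean_content
from agent import parse_page
from database import upload_entry

async def main():
    url = "https://example.com/open-call"              # illustrative URL
    markdown = await get_clean_content(url)            # crawl + clean to markdown
    data = await parse_page(markdown, "opportunity")   # LLM -> structured dict
    upload_entry(data, "opportunity", url)             # write to PocketBase

asyncio.run(main())
```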
## Troubleshooting
- If dates show as `None` after save: verify PocketBase field names (`datetime` for events, `deadline` for opportunities) and ensure `.env` is configured.
- If the bot doesn't start: check `TG_TOKEN` is present and valid.
- If parsing fails or you see unexpected behavior, check logs printed to the console for `convert_datetime_to_pocketbase()` and `upload_entry()` debug messages.

5 binary files not shown.

68
agent.py Normal file

@@ -0,0 +1,68 @@
from pydantic_ai import Agent
from pydantic_ai.models.ollama import OllamaModel
from pydantic_ai.providers.ollama import OllamaProvider
from dotenv import load_dotenv
import os
import json
from prompts import OPPORTUNITY_PROMPT, EVENT_PROMPT
load_dotenv()
ollama_url = os.getenv("OLLAMA_BASE_URL")
prov = OllamaProvider(base_url=ollama_url)
# Use qwen2.5:3b or phi4-mini for low-end hardware (RAM < 8GB)
model = OllamaModel(
model_name='granite4.1:8b',
provider=prov
)
# --- OPPORTUNITY AGENT ---
opportunity_agent = Agent(
model,
output_type=str,
system_prompt=OPPORTUNITY_PROMPT,
retries=5
)
# --- EVENT AGENT ---
event_agent = Agent(
model,
output_type=str,
system_prompt=EVENT_PROMPT,
retries=5
)
async def parse_page(content: str, entry_type: str = "opportunity"):
"""
Parse content and extract entry data based on type.
Args:
content: The raw text content to parse
entry_type: Either 'opportunity' or 'event'
"""
# Select the appropriate agent
agent = opportunity_agent if entry_type == "opportunity" else event_agent
# 1. Run the agent (which returns a string)
print(f"Parsing {entry_type}...")
print(content)
result = await agent.run(content)
raw_text = result.output
print(raw_text)
# 2. Clean the string
# We remove the markdown decorators so json.loads doesn't crash
clean_json = raw_text.replace("```json", "").replace("```", "").strip()
try:
# 3. Convert string to a dictionary
data_dict = json.loads(clean_json)
# 4. Success! Return the dictionary to the caller (bot.py)
return data_dict
except json.JSONDecodeError as e:
print(f"Critical Error: The AI sent invalid JSON. Text was: {raw_text}")
raise e

279
bot.py Normal file

@@ -0,0 +1,279 @@
import os
import asyncio
import logging
from dotenv import load_dotenv
from functools import wraps
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import ApplicationBuilder, CommandHandler, MessageHandler, filters, ContextTypes, CallbackQueryHandler
# Import your existing logic
from agent import parse_page
from database import upload_entry
from scraper import get_clean_content
load_dotenv()
# Configuration
TOKEN = os.getenv("TG_TOKEN")
_allowed_env = os.getenv("ALLOWED_USERS", "")
if _allowed_env:
try:
ALLOWED_IDS = [int(x.strip()) for x in _allowed_env.split(',') if x.strip()]
except Exception:
logging.warning("Failed to parse ALLOWED_USERS from .env; defaulting to empty list")
ALLOWED_IDS = []
else:
ALLOWED_IDS = []
if not TOKEN:
logging.warning("TG_TOKEN not set in .env; bot will not start without a token")
# Setup Logging
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
# --- Retry Function with Exponential Backoff ---
def retry(max_attempts=3, backoff_factor=2, initial_delay=1):
"""
Decorator for retrying async functions with exponential backoff.
Args:
max_attempts: Maximum number of retry attempts
backoff_factor: Multiplier for delay between retries
initial_delay: Initial delay in seconds
"""
def decorator(func):
@wraps(func)
async def wrapper(*args, **kwargs):
delay = initial_delay
last_exception = None
for attempt in range(1, max_attempts + 1):
try:
return await func(*args, **kwargs)
except Exception as e:
last_exception = e
if attempt < max_attempts:
logging.warning(f"Attempt {attempt}/{max_attempts} failed for {func.__name__}: {str(e)}")
await asyncio.sleep(delay)
delay *= backoff_factor
else:
logging.error(f"All {max_attempts} attempts failed for {func.__name__}")
raise last_exception
return wrapper
return decorator
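# Note: `retry` is defined but not applied anywhere in this commit yet.
# Hypothetical usage sketch:
# @retry(max_attempts=3, backoff_factor=2)
# async def flaky_llm_call(...):
#     ...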
# --- The Queue System ---
# This ensures only ONE crawl/parse happens at a time to save RAM
task_queue = asyncio.Queue()
def build_entry_summary(data, entry_type, saved=False):
if entry_type == "event":
event_datetime = data.get('date_time') or data.get('datetime')
return (
f"✅ **{data.get('title', 'Unknown')}**\n"
f"🦆 Org/s: {data.get('org')}\n"
f"📅 Date & Time: {event_datetime}\n"
f"📍 Location: {data.get('location')}\n"
f"🐊 Summary: {data.get('summary')}"
+ ("\n\n💾 **Saved to PocketBase!**" if saved else "")
)
return (
f"✅ **{data.get('title', 'Unknown')}**\n"
f"🦆 Org/s: {data.get('org')}\n"
f"📋 Type: {data.get('type')}\n"
f"🦢 Deadline: {data.get('deadline')}\n"
f"☁️ Location: {data.get('location')}\n"
f"🐊 Summary: {data.get('summary')}"
+ ("\n\n💾 **Saved to PocketBase!**" if saved else "")
)
async def worker():
while True:
# Get a 'task' from the queue
update, context, source_value, entry_type, source_kind = await task_queue.get()
try:
await process_link(update, context, source_value, entry_type, source_kind)
finally:
task_queue.task_done()
async def process_link(update, context, source_value, entry_type="opportunity", source_kind="url"):
# Handle both message and callback query contexts
if update.message:
status_msg = await update.message.reply_text("⏳ Crawling & Analyzing...")
elif update.callback_query:
status_msg = update.callback_query.message
await status_msg.edit_text("⏳ Crawling & Analyzing...")
else:
logging.error("Could not determine message context for status update")
return
# Store source and type for potential retry/save
context.user_data['last_source_value'] = source_value
context.user_data['last_source_kind'] = source_kind
context.user_data['last_entry_type'] = entry_type
try:
if source_kind == "url":
markdown = await get_clean_content(source_value)
else:
markdown = source_value
extracted_data = await parse_page(markdown, entry_type)
if not extracted_data:
await status_msg.edit_text("❌ Failed to extract data from that page.")
return
# 2. Store data temporarily in context to 'Save' later
context.user_data['last_extracted'] = extracted_data
context.user_data['awaiting_save_url'] = False
context.user_data['pending_save_url'] = None
# 3. Show Result with Buttons - format depends on entry type
summary = build_entry_summary(extracted_data, entry_type)
keyboard = [
[InlineKeyboardButton("💾 Save to DB", callback_data='save_db')],
[InlineKeyboardButton("🗑️ Discard", callback_data='discard')],
[InlineKeyboardButton("🔄️ Retry", callback_data='retry')]
]
reply_markup = InlineKeyboardMarkup(keyboard)
await status_msg.edit_text(summary, reply_markup=reply_markup, parse_mode='Markdown')
except Exception as e:
await status_msg.edit_text(f"⚠️ Error: {str(e)}")
# --- Handlers ---
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
if update.effective_user.id not in ALLOWED_IDS:
return
await update.message.reply_text(
"Welcome! I can extract arts opportunities and events.\n\n"
"📋 **Commands:**\n"
"/op <url> - Extract an opportunity\n"
"/ev <url> - Extract an event"
)
async def handle_opportunity(update: Update, context: ContextTypes.DEFAULT_TYPE):
user_id = update.effective_user.id
if user_id not in ALLOWED_IDS:
await update.message.reply_text("Unauthorized. User ID needs to be added!")
return
if not context.args:
await update.message.reply_text("Please provide a URL or paste text. Usage: /op <url or text>")
return
input_text = " ".join(context.args).strip()
if not input_text:
await update.message.reply_text("Please provide a URL or paste text. Usage: /op <url or text>")
return
source_kind = "url" if input_text.startswith("http") else "text"
await update.message.reply_text("📥 Link queued for processing...")
await task_queue.put((update, context, input_text, "opportunity", source_kind))
async def handle_event(update: Update, context: ContextTypes.DEFAULT_TYPE):
user_id = update.effective_user.id
if user_id not in ALLOWED_IDS:
await update.message.reply_text("Unauthorized. User ID needs to be added!")
return
if not context.args:
await update.message.reply_text("Please provide a URL or paste text. Usage: /ev <url or text>")
return
input_text = " ".join(context.args).strip()
if not input_text:
await update.message.reply_text("Please provide a URL or paste text. Usage: /ev <url or text>")
return
source_kind = "url" if input_text.startswith("http") else "text"
await update.message.reply_text("📥 Link queued for processing...")
await task_queue.put((update, context, input_text, "event", source_kind))
async def handle_followup_text(update: Update, context: ContextTypes.DEFAULT_TYPE):
if update.effective_user.id not in ALLOWED_IDS:
return
if not context.user_data.get('awaiting_save_url'):
return
text = update.message.text.strip()
if not text:
await update.message.reply_text("Please send a URL or type /skip to save without one.")
return
if text.lower() == '/skip':
url = None
elif text.startswith('http'):
url = text
else:
await update.message.reply_text("Please send a valid URL or type /skip to save without one.")
return
data = context.user_data.get('last_extracted')
entry_type = context.user_data.get('last_entry_type', 'opportunity')
if data:
upload_entry(data, entry_type, url)
context.user_data['awaiting_save_url'] = False
context.user_data['pending_save_url'] = None
await update.message.reply_text(build_entry_summary(data, entry_type, saved=True), parse_mode='Markdown')
async def button_handler(update: Update, context: ContextTypes.DEFAULT_TYPE):
query = update.callback_query
await query.answer()
if query.data == 'save_db':
data = context.user_data.get('last_extracted')
entry_type = context.user_data.get('last_entry_type', 'opportunity')
if data:
if context.user_data.get('last_source_kind') == 'text':
context.user_data['awaiting_save_url'] = True
context.user_data['pending_save_url'] = None
await query.edit_message_text(
build_entry_summary(data, entry_type)
+ "\n\nSend a source URL to attach it, or type /skip to save without one.",
parse_mode='Markdown'
)
else:
url = context.user_data.get('last_source_value')
upload_entry(data, entry_type, url) # Pass URL to save with entry
await query.edit_message_text(build_entry_summary(data, entry_type, saved=True), parse_mode='Markdown')
elif query.data == 'retry':
# Retry processing the last URL with the same entry type
source_value = context.user_data.get('last_source_value')
source_kind = context.user_data.get('last_source_kind', 'url')
entry_type = context.user_data.get('last_entry_type', 'opportunity')
if source_value:
await query.edit_message_text("⏳ Retrying...")
await task_queue.put((update, context, source_value, entry_type, source_kind))
else:
await query.edit_message_text("❌ No source content to retry.")
else:
await query.edit_message_text("🗑️ Discarded.")
# --- Main Entry ---
async def post_init(app):
# Launch the background worker task once the application's event loop is running
app.create_task(worker())
if __name__ == '__main__':
application = ApplicationBuilder().token(TOKEN).post_init(post_init).build()
# Add Handlers
application.add_handler(CommandHandler("start", start))
application.add_handler(CommandHandler("op", handle_opportunity))
application.add_handler(CommandHandler("ev", handle_event))
# /skip is a command, so the TEXT handler below never sees it; route it explicitly
application.add_handler(CommandHandler("skip", handle_followup_text))
application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), handle_followup_text))
application.add_handler(CallbackQueryHandler(button_handler))
print("🤖 Bot is running...")
application.run_polling()

97
database.py Normal file

@@ -0,0 +1,97 @@
import os
from dotenv import load_dotenv
from pocketbase import PocketBase
from schemas import EntrySchema
from datetime import datetime
load_dotenv()
pb = PocketBase(os.getenv('POCKETBASE_URL'))
admin_data = pb.admins.auth_with_password(os.getenv('POCKETBASE_ADMIN_EMAIL'), os.getenv('POCKETBASE_ADMIN_PASSWORD'))
def convert_datetime_to_pocketbase(date_time_str):
"""
Convert datetime string from DD-MM-YYYY HH:MM format to PocketBase datetime format.
PocketBase (Local) expects: YYYY-MM-DD HH:MM:SS
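Example: '15-11-2026 14:00' -> '2026-11-15 14:00:00'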
"""
if date_time_str == 'N/A' or not date_time_str:
return None
try:
print(f"[DEBUG] Converting datetime: '{date_time_str}' (type: {type(date_time_str)})")
# Parse the input format: "DD-MM-YYYY HH:MM" or "DD-MM-YYYY (HH:MM)"
date_time_str = date_time_str.replace("(", "").replace(")", "").strip()
# Try with time first
if " " in date_time_str:
dt = datetime.strptime(date_time_str, "%d-%m-%Y %H:%M")
else:
# If only date is provided, set time to 00:00
dt = datetime.strptime(date_time_str, "%d-%m-%Y")
# Convert to PocketBase local datetime format: YYYY-MM-DD HH:MM:SS
pb_format = dt.strftime("%Y-%m-%d %H:%M:%S")
print(f"[DEBUG] Converted to PocketBase format: '{pb_format}'")
return pb_format
except Exception as e:
print(f"[ERROR] Error converting datetime '{date_time_str}': {e}")
import traceback
traceback.print_exc()
return None
def upload_entry(data, entry_type='opportunity', url=None):
"""
Upload entry to the appropriate PocketBase collection.
Args:
data: Dictionary containing the entry data
entry_type: 'opportunity' or 'event'
url: The source URL of the entry
"""
print(f"[DEBUG] Uploading {entry_type} entry. Data: {data}")
data = dict(data)
# Add URL to data if provided
if url:
data['url'] = url
print(f"[DEBUG] Added URL: {url}")
try:
if entry_type == 'event':
# Map 'date_time' from agent to 'datetime' for PocketBase
if 'date_time' in data:
original_dt = data['date_time']
# Convert and map to PocketBase field name
data['datetime'] = convert_datetime_to_pocketbase(data['date_time'])
# Remove the original field since PocketBase expects 'datetime'
del data['date_time']
print(f"[DEBUG] Event datetime: '{original_dt}' -> '{data['datetime']}'")
else:
print(f"[WARNING] No 'date_time' field found in event data")
# Upload to events collection
print(f"[DEBUG] Creating record in 'events' collection with data: {data}")
result = pb.collection('events').create(data)
print(f"[DEBUG] Successfully created record: {result}")
return result
else:
# Opportunities - convert deadline to datetime format
if 'deadline' in data:
original_deadline = data['deadline']
# Convert deadline to PocketBase datetime format
data['deadline'] = convert_datetime_to_pocketbase(data['deadline'])
print(f"[DEBUG] Opportunity deadline: '{original_deadline}' -> '{data['deadline']}'")
else:
print(f"[WARNING] No 'deadline' field found in opportunity data")
# Upload to opportunities collection
print(f"[DEBUG] Creating record in 'opportunities' collection with data: {data}")
result = pb.collection('opportunities').create(data)
print(f"[DEBUG] Successfully created record: {result}")
return result
except Exception as e:
print(f"[ERROR] Failed to upload entry to PocketBase: {e}")
import traceback
traceback.print_exc()
raise

55
prompts.py Normal file

@@ -0,0 +1,55 @@
# Central place for agent system prompts
OPPORTUNITY_PROMPT = (
"You are a precise Data Extraction Specialist. Your goal is to convert "
"unstructured arts opportunity text into a strictly valid JSON object.\n\n"
"# TASK\n"
"Analyze the provided text and extract information into these JSON keys:\n"
"1. 'title': The title of the opportunity\n"
"2. 'org': The name of the organizing body/bodies\n"
"3. 'type': The category (e.g., Residency, Funding, Open Call, Workshop).\n"
"4. 'summary': A 3-sentence description of what the opportunity involves.\n"
"5. 'deadline': The deadline of the opportunity. Format: DD-MM-YYYY. Assume year 2026 if missing.\n"
"6. 'location': The physical city/country or 'Online'.\n\n"
"# CONSTRAINTS\n"
"- Return ONLY the JSON object inside markdown backticks (```json ... ```).\n"
"- Do NOT include any introductory or conversational text.\n"
"- If a field is missing, use 'N/A'.\n\n"
"# EXAMPLE OUTPUT\n"
"```json\n"
"{\n"
" \"title\": \"Digital Horizons 2026\",\n"
" \"org\": \"Digital Horizons\",\n"
" \"type\": \"Residency\",\n"
" \"summary\": \"A residency for digital artists to explore VR. Includes a stipend.\",\n"
" \"deadline\": \"15-11-2026\",\n"
" \"location\": \"Berlin, Germany\"\n"
"}\n"
"```"
)
EVENT_PROMPT = (
"You are a precise Data Extraction Specialist. Your goal is to convert "
"unstructured event text into a strictly valid JSON object.\n\n"
"# TASK\n"
"Analyze the provided text and extract information into these JSON keys:\n"
"1. 'title': The name/title of the event\n"
"2. 'org': The name of the organizing body/bodies\n"
"3. 'date_time': The date and time of the event. Format: DD-MM-YYYY (HH:MM) or 'N/A' if not specified.\n"
"4. 'summary': A 3-sentence description of what the event is about.\n"
"5. 'location': The physical venue/city/country or 'Online'.\n\n"
"# CONSTRAINTS\n"
"- Return ONLY the JSON object inside markdown backticks (```json ... ```).\n"
"- Do NOT include any introductory or conversational text.\n"
"- If a field is missing, use 'N/A'.\n\n"
"# EXAMPLE OUTPUT\n"
"```json\n"
"{\n"
" \"title\": \"Digital Arts Symposium 2026\",\n"
" \"org\": \"Digital Arts Society\",\n"
" \"date_time\": \"20-06-2026 14:00\",\n"
" \"summary\": \"Join us for a day of talks and workshops exploring digital art. Meet artists and curators. Includes lunch and networking.\",\n"
" \"location\": \"London, UK\"\n"
"}\n"
"```"
)

189
requirements.txt Normal file

@@ -0,0 +1,189 @@
ag-ui-protocol==0.1.18
aiofile==3.9.0
aiofiles==25.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.13.5
aiosignal==1.4.0
aiosqlite==0.22.1
alphashape==1.3.1
annotated-doc==0.0.4
annotated-types==0.7.0
anthropic==0.100.0
anyio==4.13.0
argcomplete==3.6.3
attrs==26.1.0
Authlib==1.7.2
beartype==0.22.9
beautifulsoup4==4.14.3
boto3==1.43.6
botocore==1.43.6
brotli==1.2.0
cachetools==7.1.1
caio==0.9.25
certifi==2026.4.22
cffi==2.0.0
chardet==7.4.3
charset-normalizer==3.4.7
click==8.3.3
click-log==0.4.0
cohere==5.21.1
colorama==0.4.6
Crawl4AI==0.8.6
cryptography==48.0.0
cssselect==1.4.0
cyclopts==4.11.2
distro==1.9.0
dnspython==2.8.0
docstring_parser==0.18.0
docutils==0.22.4
email-validator==2.3.0
eval_type_backport==0.3.1
exceptiongroup==1.3.1
executing==2.2.1
fake-useragent==2.2.0
fastavro==1.12.2
fastmcp==3.2.4
fastuuid==0.14.0
filelock==3.29.0
frozenlist==1.8.0
fsspec==2026.4.0
genai-prices==0.0.59
google-auth==2.52.0
google-genai==2.0.0
googleapis-common-protos==1.75.0
greenlet==3.5.0
griffelib==2.0.2
groq==1.2.0
grpcio==1.80.0
h11==0.16.0
h2==4.3.0
hf-xet==1.5.0
hpack==4.1.0
httpcore==1.0.9
httpx==0.28.1
httpx-sse==0.4.3
huggingface_hub==1.14.0
humanize==4.15.0
hyperframe==6.1.0
idna==3.13
importlib_metadata==8.7.1
jaraco.classes==3.4.0
jaraco.context==6.1.2
jaraco.functools==4.4.0
Jinja2==3.1.6
jiter==0.14.0
jmespath==1.1.0
joblib==1.5.3
joserfc==1.6.5
jsonpath-python==1.1.6
jsonref==1.1.0
jsonschema==4.26.0
jsonschema-path==0.4.6
jsonschema-specifications==2025.9.1
keyring==25.7.0
lark==1.3.1
logfire==4.32.1
logfire-api==4.32.1
lxml==5.4.0
markdown-it-py==4.2.0
markdownify==1.2.2
MarkupSafe==3.0.3
mcp==1.27.1
mdurl==0.1.2
mistralai==2.4.5
more-itertools==11.0.2
multidict==6.7.1
networkx==3.6.1
nexus-rpc==1.4.0
nltk==3.9.4
numpy==2.4.4
openai==2.36.0
openapi-pydantic==0.5.1
opentelemetry-api==1.39.1
opentelemetry-exporter-otlp-proto-common==1.39.1
opentelemetry-exporter-otlp-proto-http==1.39.1
opentelemetry-instrumentation==0.60b1
opentelemetry-instrumentation-httpx==0.60b1
opentelemetry-proto==1.39.1
opentelemetry-sdk==1.39.1
opentelemetry-semantic-conventions==0.60b1
opentelemetry-util-http==0.60b1
packaging==25.0
patchright==1.59.1
pathable==0.5.0
pillow==12.2.0
platformdirs==4.9.6
playwright==1.59.0
playwright-stealth==2.0.3
pocketbase==0.17.1
prompt_toolkit==3.0.52
propcache==0.5.2
protobuf==6.33.6
psutil==7.2.2
py-key-value-aio==0.4.4
pyasn1==0.6.3
pyasn1_modules==0.4.2
pycparser==3.0
pydantic==2.13.4
pydantic-ai==1.92.0
pydantic-ai-slim==1.92.0
pydantic-evals==1.92.0
pydantic-graph==1.92.0
pydantic-handlebars==0.1.0
pydantic-settings==2.14.1
pydantic_core==2.46.4
pyee==13.0.1
Pygments==2.20.0
PyJWT==2.12.1
pyOpenSSL==26.2.0
pyperclip==1.11.0
python-dateutil==2.9.0.post0
python-dotenv==1.2.2
python-multipart==0.0.27
python-telegram-bot==22.7
pywin32==311
pywin32-ctypes==0.2.3
PyYAML==6.0.3
rank-bm25==0.2.2
referencing==0.37.0
regex==2026.4.4
requests==2.33.1
rich==15.0.0
rich-rst==1.3.2
rpds-py==0.30.0
rtree==1.4.1
s3transfer==0.17.0
scipy==1.17.1
setuptools==82.0.1
shapely==2.1.2
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
snowballstemmer==2.2.0
soupsieve==2.8.3
sse-starlette==3.4.2
starlette==1.0.0
temporalio==1.27.0
tenacity==9.1.4
tiktoken==0.12.0
tokenizers==0.23.1
tqdm==4.67.3
trimesh==4.12.2
typer==0.25.1
types-protobuf==6.32.1.20260221
types-requests==2.33.0.20260508
typing-inspection==0.4.2
typing_extensions==4.15.0
uncalled-for==0.3.2
unclecode-litellm==1.81.13
urllib3==2.7.0
uvicorn==0.46.0
watchfiles==1.1.1
wcwidth==0.7.0
websockets==16.0
wheel==0.46.3
wrapt==1.17.3
xai-sdk==1.12.2
xxhash==3.7.0
yarl==1.23.0
zipp==3.23.1

21
schemas.py Normal file

@@ -0,0 +1,21 @@
from typing import Union, Literal
from pydantic import BaseModel, Field
## This file isn't used at runtime; it's just an outline of the schemas
class BaseEntry(BaseModel):
title: str = Field(description="The name of the opportunity")
org: str = Field(description="The organisation")
summary: str = Field(description="A 3 sentence summary of what this is")
class Event(BaseEntry):
type: Literal["event"] = "event"
date_time: str = Field(description="Date and time of the event")
location: str = Field(description="Location of the event")
class Opportunity(BaseEntry):
type: str = Field(description="The type of opportunity (Open Call, Funding, Residency, etc.)")
deadline: str = Field(description="The deadline, in dd-mm-yyyy format")
location: str = Field(description="Location of entry")
EntrySchema = Union[Event, Opportunity]
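# Hypothetical validation sketch (the bot currently passes raw dicts around):
# from pydantic import TypeAdapter
# entry = TypeAdapter(EntrySchema).validate_python(data_dict)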

29
scraper.py Normal file

@@ -0,0 +1,29 @@
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator
async def get_clean_content(url: str):
"""Crawl a URL and return its main content as clean markdown."""
md_generator = DefaultMarkdownGenerator(
options={
"ignore_links": True,
"ignore_images": True,
"body_width": 0,
}
)
browser_conf = BrowserConfig(
# cdp_url="http://127.0.0.1:9222", # Use your existing Chrome session
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
run_conf = CrawlerRunConfig(
cache_mode="bypass",
word_count_threshold=10,
excluded_tags=["nav", "script", "style"],
markdown_generator=md_generator,
delay_before_return_html=3.0,
js_code="window.scrollTo(0, document.body.scrollHeight);",
magic=True
)
async with AsyncWebCrawler(config=browser_conf) as crawler:
result = await crawler.arun(url=url, config=run_conf)
return result.markdown
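# Quick manual check (illustrative URL; requires crawl4ai's browser setup):
# import asyncio
# print(asyncio.run(get_clean_content("https://example.com")))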