commit 4726582379144c30e6f27796ed18c8311accbc6a Author: Cailean Finn Date: Sun May 10 13:14:14 2026 +0100 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2eea525 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.env \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..a7e9129 --- /dev/null +++ b/README.md @@ -0,0 +1,115 @@ +# null-bot + +> A small Telegram bot for extracting and saving opportunities and events from web pages or pasted text. Uses an LLM agent to parse content into structured JSON and stores entries in a local PocketBase instance. This bot uses all open-source tools. The LLM of choice is granite4.1:8b by IBM under their Apache 2.0 License. + +## Features +- Parse Opportunity (`/op`) and Event (`/ev`) entries from a URL or pasted text +- Two entry types with separate system prompts and JSON schemas (externalized to `prompts.py`) +- Follow-up prompt when users paste text: ask for a source URL only when saving +- Converts date/time to PocketBase-friendly format (`YYYY-MM-DD HH:MM:SS`) +- Retry decorator for robust LLM / network calls + +## Requirements +- Python 3.11+ recommended +- See `requirements.txt` for full dependency list + +## Setup +1. Clone the repo or copy files to your machine. +2. Create and activate a Python virtual environment: + +```bash +python -m venv .venv +# Windows +.venv\Scripts\activate +# macOS / Linux +source .venv/bin/activate +``` + +3. Install dependencies: + +```bash +pip install -r requirements.txt +``` + +4. Environment variables +- Create a `.env` file in the project root with at minimum: + +``` +TG_TOKEN=your_telegram_bot_token_here +OLLAMA_BASE_URL=http://localhost:11434/v1 +ALLOWED_USERS=1234,5678 +POCKETBASE_URL=http://127.0.0.1:8090 +POCKETBASE_ADMIN_EMAIL=admin@example.com +POCKETBASE_ADMIN_PASSWORD=secret +``` + +- Notes: + - `ALLOWED_USERS` should be a comma-separated list of Telegram user IDs (no brackets). 
+ - The bot reads `TG_TOKEN` and `ALLOWED_USERS` from the environment. + +5. Ollama (local LLM) setup + +- This project uses a local Ollama instance (or any compatible local LLM HTTP API) as the LLM provider. The bot expects an HTTP endpoint available at `OLLAMA_BASE_URL` (default `http://localhost:11434/v1`). + +- Quick steps to get Ollama running locally: + + 1. Install Ollama for your platform — follow the official instructions: https://ollama.com/docs (or use the native installer for Windows/macOS/Linux). + + 2. Pull or install a model you want to use. Example (CLI): + + ```bash + ollama pull granite4.1:8b + ``` + + 3. Start the Ollama daemon / HTTP API so the bot can reach it. Depending on your Ollama installation this may be: + + ```bash + # example commands — consult your Ollama docs if these differ + ollama serve + # or + ollama daemon + ``` + + 4. Set `OLLAMA_BASE_URL` in your `.env` to point to the running API, for example: + + ```text + OLLAMA_BASE_URL=http://localhost:11434/v1 + ``` + + 5. Verify the API is reachable (example curl; use a model you have pulled): + + ```bash + curl -s -X POST "${OLLAMA_BASE_URL}/completions" \ + -H "Content-Type: application/json" \ + -d '{"model":"granite4.1:8b","prompt":"hello","max_tokens":16}' + ``` + + A successful response indicates your Ollama HTTP API is reachable and can serve model requests. + +- Notes and troubleshooting + - If your Ollama installation exposes a different port or path, update `OLLAMA_BASE_URL` accordingly. + - If you prefer hosted LLMs (OpenAI, Anthropic, Cohere, etc.), `agent.py` can be adapted to use other providers; ensure the provider client is configured and the prompts in `prompts.py` are compatible. 
+ +## Running the bot + +Start the bot with the project's entrypoint (example): + +```bash +python bot.py +``` + +The bot listens for commands: +- `/op ` — parse an opportunity +- `/ev ` — parse an event + +If you paste text (instead of sending a URL), the bot will parse it and when you click Save it will prompt you for a source URL (or you can `/skip`). + +## How it works (high-level) +- `agent.py` uses `pydantic-ai` + a local LLM provider (e.g. Ollama) and system prompts from `prompts.py` to parse pages/text into structured JSON. +- `database.py` converts datetime fields and uploads the entry to the appropriate PocketBase collection (`events` or `opportunities`). +- `bot.py` handles Telegram interactions, queues parse tasks, and preserves per-user state in `context.user_data`. + +## Troubleshooting +- If dates show as `None` after save: verify PocketBase field names (`datetime` for events, `deadline` for opportunities) and ensure `.env` is configured. +- If the bot doesn't start: check `TG_TOKEN` is present and valid. +- If parsing fails or you see unexpected behavior, check logs printed to the console for `convert_datetime_to_pocketbase()` and `upload_entry()` debug messages. 
\ No newline at end of file diff --git a/__pycache__/agent.cpython-312.pyc b/__pycache__/agent.cpython-312.pyc new file mode 100644 index 0000000..2df0bf8 Binary files /dev/null and b/__pycache__/agent.cpython-312.pyc differ diff --git a/__pycache__/database.cpython-312.pyc b/__pycache__/database.cpython-312.pyc new file mode 100644 index 0000000..db655c2 Binary files /dev/null and b/__pycache__/database.cpython-312.pyc differ diff --git a/__pycache__/prompts.cpython-312.pyc b/__pycache__/prompts.cpython-312.pyc new file mode 100644 index 0000000..3230e72 Binary files /dev/null and b/__pycache__/prompts.cpython-312.pyc differ diff --git a/__pycache__/schemas.cpython-312.pyc b/__pycache__/schemas.cpython-312.pyc new file mode 100644 index 0000000..58410ba Binary files /dev/null and b/__pycache__/schemas.cpython-312.pyc differ diff --git a/__pycache__/scraper.cpython-312.pyc b/__pycache__/scraper.cpython-312.pyc new file mode 100644 index 0000000..0995313 Binary files /dev/null and b/__pycache__/scraper.cpython-312.pyc differ diff --git a/agent.py b/agent.py new file mode 100644 index 0000000..301b6c3 --- /dev/null +++ b/agent.py @@ -0,0 +1,68 @@ +from pydantic_ai import Agent +from pydantic_ai.models.ollama import OllamaModel +from pydantic_ai.providers.ollama import OllamaProvider +from dotenv import load_dotenv +import os +import json +from prompts import OPPORTUNITY_PROMPT, EVENT_PROMPT + +load_dotenv() + +ollama_url = os.getenv("OLLAMA_BASE_URL") + +prov = OllamaProvider(base_url=ollama_url) + +# Use qwen2.5:3b or phi4-mini for low-end hardware (RAM < 8GB) +model = OllamaModel( + model_name='granite4.1:8b', + provider=prov +) + +# --- OPPORTUNITY AGENT --- +opportunity_agent = Agent( + model, + output_type=str, + system_prompt=OPPORTUNITY_PROMPT, + retries=5 +) + +# --- EVENT AGENT --- +event_agent = Agent( + model, + output_type=str, + system_prompt=EVENT_PROMPT, + retries=5 +) + +async def parse_page(content: str, entry_type: str = "opportunity"): + """ + 
Parse content and extract entry data based on type. + + Args: + content: The raw text content to parse + entry_type: Either 'opportunity' or 'event' + """ + # Select the appropriate agent + agent = opportunity_agent if entry_type == "opportunity" else event_agent + + # 1. Run the agent (which returns a string) + print(f"Parsing {entry_type}...") + print(content) + result = await agent.run(content) + raw_text = result.output + print(raw_text) + + # 2. Clean the string + # We remove the markdown decorators so json.loads doesn't crash + clean_json = raw_text.replace("```json", "").replace("```", "").strip() + + try: + # 3. Convert string to a dictionary + data_dict = json.loads(clean_json) + + # 4. Success! return the dictionary to main.py + return data_dict + + except json.JSONDecodeError as e: + print(f"Critical Error: The AI sent invalid JSON. Text was: {raw_text}") + raise e diff --git a/bot.py b/bot.py new file mode 100644 index 0000000..4799ddf --- /dev/null +++ b/bot.py @@ -0,0 +1,279 @@ +import os +import asyncio +import logging +from dotenv import load_dotenv +from functools import wraps +from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup +from telegram.ext import ApplicationBuilder, CommandHandler, MessageHandler, filters, ContextTypes, CallbackQueryHandler + +# Import your existing logic +from agent import parse_page +from database import upload_entry +from scraper import get_clean_content + +load_dotenv() + +# Configuration +TOKEN = os.getenv("TG_TOKEN") +_allowed_env = os.getenv("ALLOWED_USERS", "") +if _allowed_env: + try: + ALLOWED_IDS = [int(x.strip()) for x in _allowed_env.split(',') if x.strip()] + except Exception: + logging.warning("Failed to parse ALLOWED_USERS from .env; defaulting to empty list") + ALLOWED_IDS = [] +else: + ALLOWED_IDS = [] + +if not TOKEN: + logging.warning("TG_TOKEN not set in .env; bot will not start without a token") + +# Setup Logging +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - 
%(message)s', level=logging.INFO) + +# --- Retry Function with Exponential Backoff --- +def retry(max_attempts=3, backoff_factor=2, initial_delay=1): + """ + Decorator for retrying async functions with exponential backoff. + + Args: + max_attempts: Maximum number of retry attempts + backoff_factor: Multiplier for delay between retries + initial_delay: Initial delay in seconds + """ + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + delay = initial_delay + last_exception = None + + for attempt in range(1, max_attempts + 1): + try: + return await func(*args, **kwargs) + except Exception as e: + last_exception = e + if attempt < max_attempts: + logging.warning(f"Attempt {attempt}/{max_attempts} failed for {func.__name__}: {str(e)}") + await asyncio.sleep(delay) + delay *= backoff_factor + else: + logging.error(f"All {max_attempts} attempts failed for {func.__name__}") + + raise last_exception + return wrapper + return decorator + +# --- The Queue System --- +# This ensures only ONE crawl/parse happens at a time to save RAM +task_queue = asyncio.Queue() + + +def build_entry_summary(data, entry_type, saved=False): + if entry_type == "event": + event_datetime = data.get('date_time') or data.get('datetime') + return ( + f"✅ **{data.get('title', 'Unknown')}**\n" + f"🦆 Org/s: {data.get('org')}\n" + f"📅 Date & Time: {event_datetime}\n" + f"📍 Location: {data.get('location')}\n" + f"🐊 Summary: {data.get('summary')}" + + ("\n\n💾 **Saved to PocketBase!**" if saved else "") + ) + + return ( + f"✅ **{data.get('title', 'Unknown')}**\n" + f"🦆 Org/s: {data.get('org')}\n" + f"📋 Type: {data.get('type')}\n" + f"🦢 Deadline: {data.get('deadline')}\n" + f"☁️ Location: {data.get('location')}\n" + f"🐊 Summary: {data.get('summary')}" + + ("\n\n💾 **Saved to PocketBase!**" if saved else "") + ) + +async def worker(): + while True: + # Get a 'task' from the queue + update, context, source_value, entry_type, source_kind = await task_queue.get() + try: + await 
process_link(update, context, source_value, entry_type, source_kind) + finally: + task_queue.task_done() + +async def process_link(update, context, source_value, entry_type="opportunity", source_kind="url"): + # Handle both message and callback query contexts + if update.message: + status_msg = await update.message.reply_text(f"⏳ Crawling & Analyzing...") + elif update.callback_query: + status_msg = update.callback_query.message + await status_msg.edit_text(f"⏳ Crawling & Analyzing...") + else: + logging.error("Could not determine message context for status update") + return + + # Store source and type for potential retry/save + context.user_data['last_source_value'] = source_value + context.user_data['last_source_kind'] = source_kind + context.user_data['last_entry_type'] = entry_type + + try: + if source_kind == "url": + markdown = await get_clean_content(source_value) + else: + markdown = source_value + + extracted_data = await parse_page(markdown, entry_type) + + if not extracted_data: + await status_msg.edit_text("❌ Failed to extract data from that page.") + return + + # 2. Store data temporarily in context to 'Save' later + context.user_data['last_extracted'] = extracted_data + context.user_data['awaiting_save_url'] = False + context.user_data['pending_save_url'] = None + + # 3. 
Show Result with Buttons - format depends on entry type + summary = build_entry_summary(extracted_data, entry_type) + + keyboard = [ + [InlineKeyboardButton("💾 Save to DB", callback_data='save_db')], + [InlineKeyboardButton("🗑️ Discard", callback_data='discard')], + [InlineKeyboardButton("🔄️ Retry", callback_data='retry')] + ] + reply_markup = InlineKeyboardMarkup(keyboard) + + await status_msg.edit_text(summary, reply_markup=reply_markup, parse_mode='Markdown') + + except Exception as e: + await status_msg.edit_text(f"⚠️ Error: {str(e)}") + +# --- Handlers --- + +async def start(update: Update, context: ContextTypes.DEFAULT_TYPE): + if update.effective_user.id not in ALLOWED_IDS: + return + await update.message.reply_text( + "Welcome! I can extract arts opportunities and events.\n\n" + "📋 **Commands:**\n" + "/op - Extract an opportunity\n" + "/ev - Extract an event" + ) + +async def handle_opportunity(update: Update, context: ContextTypes.DEFAULT_TYPE): + user_id = update.effective_user.id + if user_id not in ALLOWED_IDS: + await update.message.reply_text("Unauthorized. User ID needs to be added!") + return + + if not context.args: + await update.message.reply_text("Please provide a URL or paste text. Usage: /op ") + return + + input_text = " ".join(context.args).strip() + if not input_text: + await update.message.reply_text("Please provide a URL or paste text. Usage: /op ") + return + + source_kind = "url" if input_text.startswith("http") else "text" + + await update.message.reply_text("📥 Link queued for processing...") + await task_queue.put((update, context, input_text, "opportunity", source_kind)) + +async def handle_event(update: Update, context: ContextTypes.DEFAULT_TYPE): + user_id = update.effective_user.id + if user_id not in ALLOWED_IDS: + await update.message.reply_text("Unauthorized. User ID needs to be added!") + return + + if not context.args: + await update.message.reply_text("Please provide a URL or paste text. 
Usage: /ev ") + return + + input_text = " ".join(context.args).strip() + if not input_text: + await update.message.reply_text("Please provide a URL or paste text. Usage: /ev ") + return + + source_kind = "url" if input_text.startswith("http") else "text" + + await update.message.reply_text("📥 Link queued for processing...") + await task_queue.put((update, context, input_text, "event", source_kind)) + +async def handle_followup_text(update: Update, context: ContextTypes.DEFAULT_TYPE): + if update.effective_user.id not in ALLOWED_IDS: + return + + if not context.user_data.get('awaiting_save_url'): + return + + text = update.message.text.strip() + if not text: + await update.message.reply_text("Please send a URL or type /skip to save without one.") + return + + if text.lower() == '/skip': + url = None + elif text.startswith('http'): + url = text + else: + await update.message.reply_text("Please send a valid URL or type /skip to save without one.") + return + + data = context.user_data.get('last_extracted') + entry_type = context.user_data.get('last_entry_type', 'opportunity') + if data: + upload_entry(data, entry_type, url) + context.user_data['awaiting_save_url'] = False + context.user_data['pending_save_url'] = None + await update.message.reply_text(build_entry_summary(data, entry_type, saved=True), parse_mode='Markdown') + +async def button_handler(update: Update, context: ContextTypes.DEFAULT_TYPE): + query = update.callback_query + await query.answer() + + if query.data == 'save_db': + data = context.user_data.get('last_extracted') + entry_type = context.user_data.get('last_entry_type', 'opportunity') + if data: + if context.user_data.get('last_source_kind') == 'text': + context.user_data['awaiting_save_url'] = True + context.user_data['pending_save_url'] = None + await query.edit_message_text( + build_entry_summary(data, entry_type) + + "\n\nSend a source URL to attach it, or type /skip to save without one.", + parse_mode='Markdown' + ) + else: + url = 
context.user_data.get('last_source_value') + upload_entry(data, entry_type, url) # Pass URL to save with entry + await query.edit_message_text(build_entry_summary(data, entry_type, saved=True), parse_mode='Markdown') + elif query.data == 'retry': + # Retry processing the last URL with the same entry type + source_value = context.user_data.get('last_source_value') + source_kind = context.user_data.get('last_source_kind', 'url') + entry_type = context.user_data.get('last_entry_type', 'opportunity') + if source_value: + await query.edit_message_text("⏳ Retrying...") + await task_queue.put((update, context, source_value, entry_type, source_kind)) + else: + await query.edit_message_text("❌ No source content to retry.") + else: + await query.edit_message_text("🗑️ Discarded.") + +# --- Main Entry --- +if __name__ == '__main__': + application = ApplicationBuilder().token(TOKEN).build() + + # Add Handlers + application.add_handler(CommandHandler("start", start)) + application.add_handler(CommandHandler("op", handle_opportunity)) + application.add_handler(CommandHandler("ev", handle_event)) + application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), handle_followup_text)) + application.add_handler(CallbackQueryHandler(button_handler)) + + # Start the worker thread + loop = asyncio.get_event_loop() + loop.create_task(worker()) + + print("🤖 Bot is running...") + application.run_polling() \ No newline at end of file diff --git a/database.py b/database.py new file mode 100644 index 0000000..58576f8 --- /dev/null +++ b/database.py @@ -0,0 +1,97 @@ +import os +from dotenv import load_dotenv +from pocketbase import PocketBase +from schemas import EntrySchema +from datetime import datetime + +load_dotenv() + +pb = PocketBase(os.getenv('POCKETBASE_URL')) +admin_data = pb.admins.auth_with_password(os.getenv('POCKETBASE_ADMIN_EMAIL'), os.getenv('POCKETBASE_ADMIN_PASSWORD')) + +def convert_datetime_to_pocketbase(date_time_str): + """ + Convert datetime string from 
DD-MM-YYYY HH:MM format to PocketBase datetime format. + PocketBase (Local) expects: YYYY-MM-DD HH:MM:SS + """ + if date_time_str == 'N/A' or not date_time_str: + return None + + try: + print(f"[DEBUG] Converting datetime: '{date_time_str}' (type: {type(date_time_str)})") + + # Parse the input format: "DD-MM-YYYY HH:MM" or "DD-MM-YYYY (HH:MM)" + date_time_str = date_time_str.replace("(", "").replace(")", "").strip() + + # Try with time first + if " " in date_time_str: + dt = datetime.strptime(date_time_str, "%d-%m-%Y %H:%M") + else: + # If only date is provided, set time to 00:00 + dt = datetime.strptime(date_time_str, "%d-%m-%Y") + + # Convert to PocketBase local datetime format: YYYY-MM-DD HH:MM:SS + pb_format = dt.strftime("%Y-%m-%d %H:%M:%S") + print(f"[DEBUG] Converted to PocketBase format: '{pb_format}'") + return pb_format + except Exception as e: + print(f"[ERROR] Error converting datetime '{date_time_str}': {e}") + import traceback + traceback.print_exc() + return None + +def upload_entry(data, entry_type='opportunity', url=None): + """ + Upload entry to the appropriate PocketBase collection. + + Args: + data: Dictionary containing the entry data + entry_type: 'opportunity' or 'event' + url: The source URL of the entry + """ + print(f"[DEBUG] Uploading {entry_type} entry. 
Data: {data}") + data = dict(data) + + # Add URL to data if provided + if url: + data['url'] = url + print(f"[DEBUG] Added URL: {url}") + + try: + if entry_type == 'event': + # Map 'date_time' from agent to 'datetime' for PocketBase + if 'date_time' in data: + original_dt = data['date_time'] + # Convert and map to PocketBase field name + data['datetime'] = convert_datetime_to_pocketbase(data['date_time']) + # Remove the original field since PocketBase expects 'datetime' + del data['date_time'] + print(f"[DEBUG] Event datetime: '{original_dt}' -> '{data['datetime']}'") + else: + print(f"[WARNING] No 'date_time' field found in event data") + + # Upload to events collection + print(f"[DEBUG] Creating record in 'events' collection with data: {data}") + result = pb.collection('events').create(data) + print(f"[DEBUG] Successfully created record: {result}") + return result + else: + # Opportunities - convert deadline to datetime format + if 'deadline' in data: + original_deadline = data['deadline'] + # Convert deadline to PocketBase datetime format + data['deadline'] = convert_datetime_to_pocketbase(data['deadline']) + print(f"[DEBUG] Opportunity deadline: '{original_deadline}' -> '{data['deadline']}'") + else: + print(f"[WARNING] No 'deadline' field found in opportunity data") + + # Upload to opportunities collection + print(f"[DEBUG] Creating record in 'opportunities' collection with data: {data}") + result = pb.collection('opportunities').create(data) + print(f"[DEBUG] Successfully created record: {result}") + return result + except Exception as e: + print(f"[ERROR] Failed to upload entry to PocketBase: {e}") + import traceback + traceback.print_exc() + raise \ No newline at end of file diff --git a/prompts.py b/prompts.py new file mode 100644 index 0000000..9d19bd8 --- /dev/null +++ b/prompts.py @@ -0,0 +1,55 @@ +# Central place for agent system prompts + +OPPORTUNITY_PROMPT = ( + "You are a precise Data Extraction Specialist. 
Your goal is to convert " + "unstructured arts opportunity text into a strictly valid JSON object.\n\n" + "# TASK\n" + "Analyze the provided text and extract information into these JSON keys:\n" + "1. 'title': The title of the opportunity\n" + "2. 'org': The name of the organizing body/bodies\n" + "3. 'type': The category (e.g., Residency, Funding, Open Call, Workshop).\n" + "4. 'summary': A 3-sentence description of what the opportunity involves.\n" + "5. 'deadline': The deadline of the opportunity. Format: DD-MM-YYYY. Assume year 2026 if missing.\n" + "6. 'location': The physical city/country or 'Online'.\n\n" + "# CONSTRAINTS\n" + "- Return ONLY the JSON object inside markdown backticks (```json ... ```).\n" + "- Do NOT include any introductory or conversational text.\n" + "- If a field is missing, use 'N/A'.\n\n" + "# EXAMPLE OUTPUT\n" + "```json\n" + "{\n" + " \"title\": \"Digital Horizons 2026\",\n" + " \"org\": \"Digital Horizons\",\n" + " \"type\": \"Residency\",\n" + " \"summary\": \"A residency for digital artists to explore VR. Includes a stipend.\",\n" + " \"deadline\": \"15-11-2026\",\n" + " \"location\": \"Berlin, Germany\"\n" + "}\n" + "```" +) + +EVENT_PROMPT = ( + "You are a precise Data Extraction Specialist. Your goal is to convert " + "unstructured event text into a strictly valid JSON object.\n\n" + "# TASK\n" + "Analyze the provided text and extract information into these JSON keys:\n" + "1. 'title': The name/title of the event\n" + "2. 'org': The name of the organizing body/bodies\n" + "3. 'date_time': The date and time of the event. Format: DD-MM-YYYY (HH:MM) or 'N/A' if not specified.\n" + "4. 'summary': A 3-sentence description of what the event is about.\n" + "5. 'location': The physical venue/city/country or 'Online'.\n\n" + "# CONSTRAINTS\n" + "- Return ONLY the JSON object inside markdown backticks (```json ... 
```).\n" + "- Do NOT include any introductory or conversational text.\n" + "- If a field is missing, use 'N/A'.\n\n" + "# EXAMPLE OUTPUT\n" + "```json\n" + "{\n" + " \"title\": \"Digital Arts Symposium 2026\",\n" + " \"org\": \"Digital Arts Society\",\n" + " \"date_time\": \"20-06-2026 14:00\",\n" + " \"summary\": \"Join us for a day of talks and workshops exploring digital art. Meet artists and curators. Includes lunch and networking.\",\n" + " \"location\": \"London, UK\"\n" + "}\n" + "```" +) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..93aeff6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,189 @@ +ag-ui-protocol==0.1.18 +aiofile==3.9.0 +aiofiles==25.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.5 +aiosignal==1.4.0 +aiosqlite==0.22.1 +alphashape==1.3.1 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.100.0 +anyio==4.13.0 +argcomplete==3.6.3 +attrs==26.1.0 +Authlib==1.7.2 +beartype==0.22.9 +beautifulsoup4==4.14.3 +boto3==1.43.6 +botocore==1.43.6 +brotli==1.2.0 +cachetools==7.1.1 +caio==0.9.25 +certifi==2026.4.22 +cffi==2.0.0 +chardet==7.4.3 +charset-normalizer==3.4.7 +click==8.3.3 +click-log==0.4.0 +cohere==5.21.1 +colorama==0.4.6 +Crawl4AI==0.8.6 +cryptography==48.0.0 +cssselect==1.4.0 +cyclopts==4.11.2 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.18.0 +docutils==0.22.4 +email-validator==2.3.0 +eval_type_backport==0.3.1 +exceptiongroup==1.3.1 +executing==2.2.1 +fake-useragent==2.2.0 +fastavro==1.12.2 +fastmcp==3.2.4 +fastuuid==0.14.0 +filelock==3.29.0 +frozenlist==1.8.0 +fsspec==2026.4.0 +genai-prices==0.0.59 +google-auth==2.52.0 +google-genai==2.0.0 +googleapis-common-protos==1.75.0 +greenlet==3.5.0 +griffelib==2.0.2 +groq==1.2.0 +grpcio==1.80.0 +h11==0.16.0 +h2==4.3.0 +hf-xet==1.5.0 +hpack==4.1.0 +httpcore==1.0.9 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==1.14.0 +humanize==4.15.0 +hyperframe==6.1.0 +idna==3.13 +importlib_metadata==8.7.1 +jaraco.classes==3.4.0 +jaraco.context==6.1.2 
+jaraco.functools==4.4.0 +Jinja2==3.1.6 +jiter==0.14.0 +jmespath==1.1.0 +joblib==1.5.3 +joserfc==1.6.5 +jsonpath-python==1.1.6 +jsonref==1.1.0 +jsonschema==4.26.0 +jsonschema-path==0.4.6 +jsonschema-specifications==2025.9.1 +keyring==25.7.0 +lark==1.3.1 +logfire==4.32.1 +logfire-api==4.32.1 +lxml==5.4.0 +markdown-it-py==4.2.0 +markdownify==1.2.2 +MarkupSafe==3.0.3 +mcp==1.27.1 +mdurl==0.1.2 +mistralai==2.4.5 +more-itertools==11.0.2 +multidict==6.7.1 +networkx==3.6.1 +nexus-rpc==1.4.0 +nltk==3.9.4 +numpy==2.4.4 +openai==2.36.0 +openapi-pydantic==0.5.1 +opentelemetry-api==1.39.1 +opentelemetry-exporter-otlp-proto-common==1.39.1 +opentelemetry-exporter-otlp-proto-http==1.39.1 +opentelemetry-instrumentation==0.60b1 +opentelemetry-instrumentation-httpx==0.60b1 +opentelemetry-proto==1.39.1 +opentelemetry-sdk==1.39.1 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-util-http==0.60b1 +packaging==25.0 +patchright==1.59.1 +pathable==0.5.0 +pillow==12.2.0 +platformdirs==4.9.6 +playwright==1.59.0 +playwright-stealth==2.0.3 +pocketbase==0.17.1 +prompt_toolkit==3.0.52 +propcache==0.5.2 +protobuf==6.33.6 +psutil==7.2.2 +py-key-value-aio==0.4.4 +pyasn1==0.6.3 +pyasn1_modules==0.4.2 +pycparser==3.0 +pydantic==2.13.4 +pydantic-ai==1.92.0 +pydantic-ai-slim==1.92.0 +pydantic-evals==1.92.0 +pydantic-graph==1.92.0 +pydantic-handlebars==0.1.0 +pydantic-settings==2.14.1 +pydantic_core==2.46.4 +pyee==13.0.1 +Pygments==2.20.0 +PyJWT==2.12.1 +pyOpenSSL==26.2.0 +pyperclip==1.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.2 +python-multipart==0.0.27 +python-telegram-bot==22.7 +pywin32==311 +pywin32-ctypes==0.2.3 +PyYAML==6.0.3 +rank-bm25==0.2.2 +referencing==0.37.0 +regex==2026.4.4 +requests==2.33.1 +rich==15.0.0 +rich-rst==1.3.2 +rpds-py==0.30.0 +rtree==1.4.1 +s3transfer==0.17.0 +scipy==1.17.1 +setuptools==82.0.1 +shapely==2.1.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +snowballstemmer==2.2.0 +soupsieve==2.8.3 +sse-starlette==3.4.2 +starlette==1.0.0 
+temporalio==1.27.0 +tenacity==9.1.4 +tiktoken==0.12.0 +tokenizers==0.23.1 +tqdm==4.67.3 +trimesh==4.12.2 +typer==0.25.1 +types-protobuf==6.32.1.20260221 +types-requests==2.33.0.20260508 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +uncalled-for==0.3.2 +unclecode-litellm==1.81.13 +urllib3==2.7.0 +uvicorn==0.46.0 +watchfiles==1.1.1 +wcwidth==0.7.0 +websockets==16.0 +wheel==0.46.3 +wrapt==1.17.3 +xai-sdk==1.12.2 +xxhash==3.7.0 +yarl==1.23.0 +zipp==3.23.1 diff --git a/schemas.py b/schemas.py new file mode 100644 index 0000000..568256b --- /dev/null +++ b/schemas.py @@ -0,0 +1,21 @@ +from typing import Union, Literal +from pydantic import BaseModel, Field + +## File doesn't do anything, its just an outline for the schemas + +class BaseEntry(BaseModel): + title: str = Field(description="The name of the opportunity") + org: str = Field(description="The organisation") + summary:str = Field(description="A 3 sentence summary of what this is") + +class Event(BaseEntry): + type: Literal["event"] = "event" + date_time: str = Field(description="Date and time of the event") + location: str = Field(description="Location of the event") + +class Opportunity(BaseEntry): + type: str = Field(description="The type of opportunity (Open Call, Funding, Residency, etc.)") + deadline: str = Field(description="What is the deadline in the format of dd-mm-yy") + location: str = Field(description="Location of entry") + +EntrySchema = Union[Event, Opportunity] \ No newline at end of file diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..69a026e --- /dev/null +++ b/scraper.py @@ -0,0 +1,29 @@ +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator + +async def get_clean_content(url: str): + md_generator = DefaultMarkdownGenerator( + options={ + "ignore_links": True, + "ignore_images": True, + "body_width": 0, + } + ) + + browser_conf = BrowserConfig( + # cdp_url="http://127.0.0.1:9222", # Use your existing Chrome session + 
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) + + run_conf = CrawlerRunConfig( + cache_mode="bypass", + word_count_threshold=10, + excluded_tags=["nav", "script", "style"], + markdown_generator=md_generator, + delay_before_return_html=3.0, + js_code="window.scrollTo(0, document.body.scrollHeight);", + magic=True + ) + + async with AsyncWebCrawler(config=browser_conf) as crawler: + result = await crawler.arun(url=url, config=run_conf) + return result.markdown \ No newline at end of file