From 4726582379144c30e6f27796ed18c8311accbc6a Mon Sep 17 00:00:00 2001
From: Cailean Finn
Date: Sun, 10 May 2026 13:14:14 +0100
Subject: [PATCH] first commit

---
 .gitignore                           |   1 +
 README.md                            | 115 +++++++++++
 __pycache__/agent.cpython-312.pyc    | Bin 0 -> 2130 bytes
 __pycache__/database.cpython-312.pyc | Bin 0 -> 4174 bytes
 __pycache__/prompts.cpython-312.pyc  | Bin 0 -> 2291 bytes
 __pycache__/schemas.cpython-312.pyc  | Bin 0 -> 1659 bytes
 __pycache__/scraper.cpython-312.pyc  | Bin 0 -> 1484 bytes
 agent.py                             |  68 +++++++
 bot.py                               | 279 +++++++++++++++++++++++++++
 database.py                          |  97 ++++++++++
 prompts.py                           |  55 ++++++
 requirements.txt                     | 189 ++++++++++++++++++
 schemas.py                           |  21 ++
 scraper.py                           |  29 +++
 14 files changed, 854 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 __pycache__/agent.cpython-312.pyc
 create mode 100644 __pycache__/database.cpython-312.pyc
 create mode 100644 __pycache__/prompts.cpython-312.pyc
 create mode 100644 __pycache__/schemas.cpython-312.pyc
 create mode 100644 __pycache__/scraper.cpython-312.pyc
 create mode 100644 agent.py
 create mode 100644 bot.py
 create mode 100644 database.py
 create mode 100644 prompts.py
 create mode 100644 requirements.txt
 create mode 100644 schemas.py
 create mode 100644 scraper.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2eea525
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.env
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a7e9129
--- /dev/null
+++ b/README.md
@@ -0,0 +1,115 @@
+# null-bot
+
+> A small Telegram bot for extracting and saving opportunities and events from web pages or pasted text. It uses an LLM agent to parse content into structured JSON and stores entries in a local PocketBase instance. The bot is built entirely on open-source tools; the LLM of choice is IBM's granite4.1:8b, released under the Apache 2.0 license.
+
+## Features
+- Parses Opportunity (`/op`) and Event (`/ev`) entries from a URL or pasted text
+- Two entry types with separate system prompts and JSON schemas (externalized to `prompts.py`)
+- Follow-up prompt when users paste text: asks for a source URL only when saving
+- Converts dates/times to the PocketBase-friendly format (`YYYY-MM-DD HH:MM:SS`)
+- Retry decorator for robust LLM / network calls
+
+## Requirements
+- Python 3.11+ recommended
+- See `requirements.txt` for the full dependency list
+
+## Setup
+1. Clone the repo or copy the files to your machine.
+2. Create and activate a Python virtual environment:
+
+```bash
+python -m venv .venv
+# Windows
+.venv\Scripts\activate
+# macOS / Linux
+source .venv/bin/activate
+```
+
+3. Install dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+4. Environment variables
+
+- Create a `.env` file in the project root with at minimum:
+
+```
+TG_TOKEN=your_telegram_bot_token_here
+OLLAMA_BASE_URL=http://localhost:11434/v1
+ALLOWED_USERS=1234,5678
+POCKETBASE_URL=http://127.0.0.1:8090
+POCKETBASE_ADMIN_EMAIL=admin@example.com
+POCKETBASE_ADMIN_PASSWORD=secret
+```
+
+- Notes:
+  - `ALLOWED_USERS` should be a comma-separated list of Telegram user IDs (no brackets).
+  - The bot reads `TG_TOKEN` and `ALLOWED_USERS` from the environment.
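+
+5. PocketBase setup
+
+- The bot expects a running PocketBase instance at `POCKETBASE_URL` with two collections, `events` and `opportunities`. A minimal schema matching what `database.py` uploads (field names inferred from the code; adjust to your own setup) could look like:
+
+```text
+events:         title, org, summary, location (text), datetime (date), url (url)
+opportunities:  title, org, type, summary, location (text), deadline (date), url (url)
+```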
+
+6. Ollama (local LLM) setup
+
+- This project uses a local Ollama instance (or any compatible local LLM HTTP API) as the LLM provider. The bot expects an HTTP endpoint at `OLLAMA_BASE_URL` (default `http://localhost:11434/v1`).
+
+- Quick steps to get Ollama running locally:
+
+  1. Install Ollama for your platform: follow the official instructions at https://ollama.com/docs, or use the native installer for Windows/macOS/Linux.
+
+  2. Pull the model you want to use. Example (CLI):
+
+  ```bash
+  ollama pull granite4.1:8b
+  ```
+
+  3. Start the Ollama server so the bot can reach it (many installs also run it automatically as a background service):
+
+  ```bash
+  ollama serve
+  ```
+
+  4. Set `OLLAMA_BASE_URL` in your `.env` to point to the running API, for example:
+
+  ```text
+  OLLAMA_BASE_URL=http://localhost:11434/v1
+  ```
+
+  5. Verify the API is reachable (example curl; the model name assumes you pulled granite4.1:8b above):
+
+  ```bash
+  curl -s -X POST "${OLLAMA_BASE_URL}/completions" \
+    -H "Content-Type: application/json" \
+    -d '{"model":"granite4.1:8b","prompt":"hello","max_tokens":16}'
+  ```
+
+  A successful response indicates that the Ollama HTTP API is reachable and can serve model requests.
+
+- Notes and troubleshooting
+  - If your Ollama installation exposes a different port or path, update `OLLAMA_BASE_URL` accordingly.
+  - If you prefer hosted LLMs (OpenAI, Anthropic, Cohere, etc.), `agent.py` can be adapted to other providers; ensure the provider client is configured and the prompts in `prompts.py` are compatible.
+
+## Running the bot
+
+Start the bot with the project's entrypoint:
+
+```bash
+python bot.py
+```
+
+The bot listens for these commands:
+- `/op <url or text>` — parse an opportunity
+- `/ev <url or text>` — parse an event
+
+If you paste text instead of sending a URL, the bot will parse it, and when you click Save it will prompt you for a source URL (or you can `/skip`).
+
+## How it works (high-level)
+- `agent.py` uses `pydantic-ai` with a local LLM provider (e.g. Ollama) and the system prompts from `prompts.py` to parse pages/text into structured JSON.
+- `database.py` converts datetime fields and uploads the entry to the appropriate PocketBase collection (`events` or `opportunities`).
+- `bot.py` handles Telegram interactions, queues parse tasks, and keeps per-user state in `context.user_data`.
+
+## Troubleshooting
+- If dates show as `None` after saving: verify the PocketBase field names (`datetime` for events, `deadline` for opportunities) and ensure `.env` is configured.
+- If the bot doesn't start: check that `TG_TOKEN` is present and valid.
+- If parsing fails or you see unexpected behavior, check the console logs for the `convert_datetime_to_pocketbase()` and `upload_entry()` debug messages.
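+
+## Example: driving the pipeline from Python
+
+The Telegram layer is optional; the scrape → parse → upload pipeline can also be driven directly. A minimal sketch (assuming `.env` is configured, Ollama and PocketBase are running; the URL is a placeholder):
+
+```python
+import asyncio
+
+from scraper import get_clean_content
+from agent import parse_page
+from database import upload_entry
+
+async def main():
+    url = "https://example.com/open-call"      # placeholder URL
+    markdown = await get_clean_content(url)    # crawl the page into clean markdown
+    data = await parse_page(markdown, "opportunity")  # LLM -> dict
+    upload_entry(data, "opportunity", url)     # save to PocketBase
+
+asyncio.run(main())
+```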
diff --git a/__pycache__/agent.cpython-312.pyc b/__pycache__/agent.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2df0bf8803fda22d606bfbc27a5d0d41c39db667
GIT binary patch
(binary data omitted)
diff --git a/__pycache__/database.cpython-312.pyc b/__pycache__/database.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db655c24b54a923747b45721b4789da88ba44fe0
GIT binary patch
(binary data omitted)
diff --git a/__pycache__/prompts.cpython-312.pyc b/__pycache__/prompts.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3230e721979819d8dab6923689d2561729f14fcc
GIT binary patch
(binary data omitted)
diff --git a/__pycache__/schemas.cpython-312.pyc b/__pycache__/schemas.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58410ba66eceadebb453c3787c755186fa7fd132
GIT binary patch
(binary data omitted)
diff --git a/__pycache__/scraper.cpython-312.pyc b/__pycache__/scraper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0995313c97fd066a5396421ff345a7eca7133e34
GIT binary patch
(binary data omitted)
diff --git a/agent.py b/agent.py
new file mode 100644
index 0000000..301b6c3
--- /dev/null
+++ b/agent.py
@@ -0,0 +1,68 @@
+from pydantic_ai import Agent
+from pydantic_ai.models.ollama import OllamaModel
+from pydantic_ai.providers.ollama import OllamaProvider
+from dotenv import load_dotenv
+import os
+import json
+from prompts import OPPORTUNITY_PROMPT, EVENT_PROMPT
+
+load_dotenv()
+
+ollama_url = os.getenv("OLLAMA_BASE_URL")
+
+prov = OllamaProvider(base_url=ollama_url)
+
+# Use qwen2.5:3b or phi4-mini for low-end hardware (RAM < 8GB)
+model = OllamaModel(
+    model_name='granite4.1:8b',
+    provider=prov
+)
+
+# --- OPPORTUNITY AGENT ---
+opportunity_agent = Agent(
+    model,
+    output_type=str,
+    system_prompt=OPPORTUNITY_PROMPT,
+    retries=5
+)
+
+# --- EVENT AGENT ---
+event_agent = Agent(
+    model,
+    output_type=str,
+    system_prompt=EVENT_PROMPT,
+    retries=5
+)
+
+async def parse_page(content: str, entry_type: str = "opportunity"):
+    """
+    Parse content and extract entry data based on type.
+
+    Args:
+        content: The raw text content to parse
+        entry_type: Either 'opportunity' or 'event'
+    """
+    # Select the appropriate agent
+    agent = opportunity_agent if entry_type == "opportunity" else event_agent
+
+    # 1. Run the agent (which returns a string)
+    print(f"Parsing {entry_type}...")
+    print(content)
+    result = await agent.run(content)
+    raw_text = result.output
+    print(raw_text)
+
+    # 2. Clean the string
+    # (remove the markdown code-fence decorators so json.loads doesn't crash)
+    clean_json = raw_text.replace("```json", "").replace("```", "").strip()
+
+    try:
+        # 3. Convert the string to a dictionary
+        data_dict = json.loads(clean_json)
+
+        # 4. Success! Return the dictionary to the caller (bot.py)
+        return data_dict
+
+    except json.JSONDecodeError as e:
+        print(f"Critical Error: The AI sent invalid JSON. Text was: {raw_text}")
+        raise e
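+
+# A hypothetical smoke test (not part of the bot flow): run this module
+# directly to check the Ollama connection and the prompt -> JSON round-trip.
+# Assumes Ollama is running and the model above has been pulled.
+#
+# if __name__ == "__main__":
+#     import asyncio
+#     sample = "Open call: Digital Horizons residency, Berlin. Deadline 15 Nov."
+#     print(asyncio.run(parse_page(sample, "opportunity")))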
diff --git a/bot.py b/bot.py
new file mode 100644
index 0000000..4799ddf
--- /dev/null
+++ b/bot.py
@@ -0,0 +1,279 @@
+import os
+import asyncio
+import logging
+from dotenv import load_dotenv
+from functools import wraps
+from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
+from telegram.ext import ApplicationBuilder, CommandHandler, MessageHandler, filters, ContextTypes, CallbackQueryHandler
+
+# Import your existing logic
+from agent import parse_page
+from database import upload_entry
+from scraper import get_clean_content
+
+load_dotenv()
+
+# Configuration
+TOKEN = os.getenv("TG_TOKEN")
+_allowed_env = os.getenv("ALLOWED_USERS", "")
+if _allowed_env:
+    try:
+        ALLOWED_IDS = [int(x.strip()) for x in _allowed_env.split(',') if x.strip()]
+    except Exception:
+        logging.warning("Failed to parse ALLOWED_USERS from .env; defaulting to empty list")
+        ALLOWED_IDS = []
+else:
+    ALLOWED_IDS = []
+
+if not TOKEN:
+    logging.warning("TG_TOKEN not set in .env; bot will not start without a token")
+
+# Setup Logging
+logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
+
+# --- Retry Function with Exponential Backoff ---
+def retry(max_attempts=3, backoff_factor=2, initial_delay=1):
+    """
+    Decorator for retrying async functions with exponential backoff.
+
+    Args:
+        max_attempts: Maximum number of retry attempts
+        backoff_factor: Multiplier for the delay between retries
+        initial_delay: Initial delay in seconds
+    """
+    def decorator(func):
+        @wraps(func)
+        async def wrapper(*args, **kwargs):
+            delay = initial_delay
+            last_exception = None
+
+            for attempt in range(1, max_attempts + 1):
+                try:
+                    return await func(*args, **kwargs)
+                except Exception as e:
+                    last_exception = e
+                    if attempt < max_attempts:
+                        logging.warning(f"Attempt {attempt}/{max_attempts} failed for {func.__name__}: {str(e)}")
+                        await asyncio.sleep(delay)
+                        delay *= backoff_factor
+                    else:
+                        logging.error(f"All {max_attempts} attempts failed for {func.__name__}")
+
+            raise last_exception
+        return wrapper
+    return decorator
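+
+# Example (hypothetical, not wired up): apply the decorator to any flaky
+# async call, e.g. wrapping the scraper:
+#
+# @retry(max_attempts=3, backoff_factor=2)
+# async def fetch_page(url: str) -> str:
+#     return await get_clean_content(url)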
+
+# --- The Queue System ---
+# This ensures only ONE crawl/parse happens at a time, to save RAM
+task_queue = asyncio.Queue()
+
+
+def build_entry_summary(data, entry_type, saved=False):
+    if entry_type == "event":
+        event_datetime = data.get('date_time') or data.get('datetime')
+        return (
+            f"✅ **{data.get('title', 'Unknown')}**\n"
+            f"🦆 Org/s: {data.get('org')}\n"
+            f"📅 Date & Time: {event_datetime}\n"
+            f"📍 Location: {data.get('location')}\n"
+            f"🐊 Summary: {data.get('summary')}"
+            + ("\n\n💾 **Saved to PocketBase!**" if saved else "")
+        )
+
+    return (
+        f"✅ **{data.get('title', 'Unknown')}**\n"
+        f"🦆 Org/s: {data.get('org')}\n"
+        f"📋 Type: {data.get('type')}\n"
+        f"🦢 Deadline: {data.get('deadline')}\n"
+        f"☁️ Location: {data.get('location')}\n"
+        f"🐊 Summary: {data.get('summary')}"
+        + ("\n\n💾 **Saved to PocketBase!**" if saved else "")
+    )
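+
+# For an event, the summary above renders roughly as (hypothetical values,
+# borrowed from the example in prompts.py):
+#
+#   ✅ **Digital Arts Symposium 2026**
+#   🦆 Org/s: Digital Arts Society
+#   📅 Date & Time: 20-06-2026 14:00
+#   📍 Location: London, UK
+#   🐊 Summary: ...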
Usage: /ev ") + return + + source_kind = "url" if input_text.startswith("http") else "text" + + await update.message.reply_text("📥 Link queued for processing...") + await task_queue.put((update, context, input_text, "event", source_kind)) + +async def handle_followup_text(update: Update, context: ContextTypes.DEFAULT_TYPE): + if update.effective_user.id not in ALLOWED_IDS: + return + + if not context.user_data.get('awaiting_save_url'): + return + + text = update.message.text.strip() + if not text: + await update.message.reply_text("Please send a URL or type /skip to save without one.") + return + + if text.lower() == '/skip': + url = None + elif text.startswith('http'): + url = text + else: + await update.message.reply_text("Please send a valid URL or type /skip to save without one.") + return + + data = context.user_data.get('last_extracted') + entry_type = context.user_data.get('last_entry_type', 'opportunity') + if data: + upload_entry(data, entry_type, url) + context.user_data['awaiting_save_url'] = False + context.user_data['pending_save_url'] = None + await update.message.reply_text(build_entry_summary(data, entry_type, saved=True), parse_mode='Markdown') + +async def button_handler(update: Update, context: ContextTypes.DEFAULT_TYPE): + query = update.callback_query + await query.answer() + + if query.data == 'save_db': + data = context.user_data.get('last_extracted') + entry_type = context.user_data.get('last_entry_type', 'opportunity') + if data: + if context.user_data.get('last_source_kind') == 'text': + context.user_data['awaiting_save_url'] = True + context.user_data['pending_save_url'] = None + await query.edit_message_text( + build_entry_summary(data, entry_type) + + "\n\nSend a source URL to attach it, or type /skip to save without one.", + parse_mode='Markdown' + ) + else: + url = context.user_data.get('last_source_value') + upload_entry(data, entry_type, url) # Pass URL to save with entry + await query.edit_message_text(build_entry_summary(data, entry_type, saved=True), parse_mode='Markdown') + elif query.data == 'retry': + # Retry processing the last URL with the same entry type + source_value = context.user_data.get('last_source_value') + source_kind = context.user_data.get('last_source_kind', 'url') + entry_type = context.user_data.get('last_entry_type', 'opportunity') + if source_value: + await query.edit_message_text("⏳ Retrying...") + await task_queue.put((update, context, source_value, entry_type, source_kind)) + else: + await query.edit_message_text("❌ No source content to retry.") + else: + await query.edit_message_text("🗑️ Discarded.") + +# --- Main Entry --- +if __name__ == '__main__': + application = ApplicationBuilder().token(TOKEN).build() + + # Add Handlers + application.add_handler(CommandHandler("start", start)) + application.add_handler(CommandHandler("op", handle_opportunity)) + application.add_handler(CommandHandler("ev", handle_event)) + application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), handle_followup_text)) + application.add_handler(CallbackQueryHandler(button_handler)) + + # Start the worker thread + loop = asyncio.get_event_loop() + loop.create_task(worker()) + + print("🤖 Bot is running...") + application.run_polling() \ No newline at end of file diff --git a/database.py b/database.py new file mode 100644 index 0000000..58576f8 --- /dev/null +++ b/database.py @@ -0,0 +1,97 @@ +import os +from dotenv import load_dotenv +from pocketbase import PocketBase +from schemas import EntrySchema +from datetime import datetime + 
+
+def convert_datetime_to_pocketbase(date_time_str):
+    """
+    Convert a datetime string in DD-MM-YYYY HH:MM format to the PocketBase datetime format.
+    PocketBase (local) expects: YYYY-MM-DD HH:MM:SS
+    """
+    if date_time_str == 'N/A' or not date_time_str:
+        return None
+
+    try:
+        print(f"[DEBUG] Converting datetime: '{date_time_str}' (type: {type(date_time_str)})")
+
+        # Parse the input format: "DD-MM-YYYY HH:MM" or "DD-MM-YYYY (HH:MM)"
+        date_time_str = date_time_str.replace("(", "").replace(")", "").strip()
+
+        # Try with time first
+        if " " in date_time_str:
+            dt = datetime.strptime(date_time_str, "%d-%m-%Y %H:%M")
+        else:
+            # If only a date is provided, set the time to 00:00
+            dt = datetime.strptime(date_time_str, "%d-%m-%Y")
+
+        # Convert to the PocketBase local datetime format: YYYY-MM-DD HH:MM:SS
+        pb_format = dt.strftime("%Y-%m-%d %H:%M:%S")
+        print(f"[DEBUG] Converted to PocketBase format: '{pb_format}'")
+        return pb_format
+    except Exception as e:
+        print(f"[ERROR] Error converting datetime '{date_time_str}': {e}")
+        import traceback
+        traceback.print_exc()
+        return None
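+
+# Expected behavior (worked examples, derived from the formats above):
+#   convert_datetime_to_pocketbase("20-06-2026 14:00")   -> "2026-06-20 14:00:00"
+#   convert_datetime_to_pocketbase("20-06-2026 (14:00)") -> "2026-06-20 14:00:00"
+#   convert_datetime_to_pocketbase("15-11-2026")         -> "2026-11-15 00:00:00"
+#   convert_datetime_to_pocketbase("N/A")                -> None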
+
+def upload_entry(data, entry_type='opportunity', url=None):
+    """
+    Upload an entry to the appropriate PocketBase collection.
+
+    Args:
+        data: Dictionary containing the entry data
+        entry_type: 'opportunity' or 'event'
+        url: The source URL of the entry
+    """
+    print(f"[DEBUG] Uploading {entry_type} entry. Data: {data}")
+    data = dict(data)
+
+    # Add the URL to the data if provided
+    if url:
+        data['url'] = url
+        print(f"[DEBUG] Added URL: {url}")
+
+    try:
+        if entry_type == 'event':
+            # Map 'date_time' from the agent to 'datetime' for PocketBase
+            if 'date_time' in data:
+                original_dt = data['date_time']
+                # Convert and map to the PocketBase field name
+                data['datetime'] = convert_datetime_to_pocketbase(data['date_time'])
+                # Remove the original field since PocketBase expects 'datetime'
+                del data['date_time']
+                print(f"[DEBUG] Event datetime: '{original_dt}' -> '{data['datetime']}'")
+            else:
+                print("[WARNING] No 'date_time' field found in event data")
+
+            # Upload to the events collection
+            print(f"[DEBUG] Creating record in 'events' collection with data: {data}")
+            result = pb.collection('events').create(data)
+            print(f"[DEBUG] Successfully created record: {result}")
+            return result
+        else:
+            # Opportunities - convert the deadline to datetime format
+            if 'deadline' in data:
+                original_deadline = data['deadline']
+                # Convert the deadline to the PocketBase datetime format
+                data['deadline'] = convert_datetime_to_pocketbase(data['deadline'])
+                print(f"[DEBUG] Opportunity deadline: '{original_deadline}' -> '{data['deadline']}'")
+            else:
+                print("[WARNING] No 'deadline' field found in opportunity data")
+
+            # Upload to the opportunities collection
+            print(f"[DEBUG] Creating record in 'opportunities' collection with data: {data}")
+            result = pb.collection('opportunities').create(data)
+            print(f"[DEBUG] Successfully created record: {result}")
+            return result
+    except Exception as e:
+        print(f"[ERROR] Failed to upload entry to PocketBase: {e}")
+        import traceback
+        traceback.print_exc()
+        raise
\ No newline at end of file
diff --git a/prompts.py b/prompts.py
new file mode 100644
index 0000000..9d19bd8
--- /dev/null
+++ b/prompts.py
@@ -0,0 +1,55 @@
+# Central place for agent system prompts
+
+OPPORTUNITY_PROMPT = (
+    "You are a precise Data Extraction Specialist. Your goal is to convert "
+    "unstructured arts opportunity text into a strictly valid JSON object.\n\n"
+    "# TASK\n"
+    "Analyze the provided text and extract information into these JSON keys:\n"
+    "1. 'title': The title of the opportunity\n"
+    "2. 'org': The name of the organizing body/bodies\n"
+    "3. 'type': The category (e.g., Residency, Funding, Open Call, Workshop).\n"
+    "4. 'summary': A 3-sentence description of what the opportunity involves.\n"
+    "5. 'deadline': The deadline of the opportunity. Format: DD-MM-YYYY. Assume year 2026 if missing.\n"
+    "6. 'location': The physical city/country or 'Online'.\n\n"
+    "# CONSTRAINTS\n"
+    "- Return ONLY the JSON object inside markdown backticks (```json ... ```).\n"
+    "- Do NOT include any introductory or conversational text.\n"
+    "- If a field is missing, use 'N/A'.\n\n"
+    "# EXAMPLE OUTPUT\n"
+    "```json\n"
+    "{\n"
+    "  \"title\": \"Digital Horizons 2026\",\n"
+    "  \"org\": \"Digital Horizons\",\n"
+    "  \"type\": \"Residency\",\n"
+    "  \"summary\": \"A residency for digital artists to explore VR. Includes a stipend.\",\n"
+    "  \"deadline\": \"15-11-2026\",\n"
+    "  \"location\": \"Berlin, Germany\"\n"
+    "}\n"
+    "```"
+)
+
+EVENT_PROMPT = (
+    "You are a precise Data Extraction Specialist. Your goal is to convert "
+    "unstructured event text into a strictly valid JSON object.\n\n"
+    "# TASK\n"
+    "Analyze the provided text and extract information into these JSON keys:\n"
+    "1. 'title': The name/title of the event\n"
+    "2. 'org': The name of the organizing body/bodies\n"
+    "3. 'date_time': The date and time of the event. Format: DD-MM-YYYY (HH:MM) or 'N/A' if not specified.\n"
+    "4. 'summary': A 3-sentence description of what the event is about.\n"
+    "5. 'location': The physical venue/city/country or 'Online'.\n\n"
+    "# CONSTRAINTS\n"
+    "- Return ONLY the JSON object inside markdown backticks (```json ... ```).\n"
+    "- Do NOT include any introductory or conversational text.\n"
+    "- If a field is missing, use 'N/A'.\n\n"
+    "# EXAMPLE OUTPUT\n"
+    "```json\n"
+    "{\n"
+    "  \"title\": \"Digital Arts Symposium 2026\",\n"
+    "  \"org\": \"Digital Arts Society\",\n"
+    "  \"date_time\": \"20-06-2026 14:00\",\n"
+    "  \"summary\": \"Join us for a day of talks and workshops exploring digital art. Meet artists and curators. Includes lunch and networking.\",\n"
+    "  \"location\": \"London, UK\"\n"
+    "}\n"
+    "```"
+)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..93aeff6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,189 @@
+ag-ui-protocol==0.1.18
+aiofile==3.9.0
+aiofiles==25.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.5
+aiosignal==1.4.0
+aiosqlite==0.22.1
+alphashape==1.3.1
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anthropic==0.100.0
+anyio==4.13.0
+argcomplete==3.6.3
+attrs==26.1.0
+Authlib==1.7.2
+beartype==0.22.9
+beautifulsoup4==4.14.3
+boto3==1.43.6
+botocore==1.43.6
+brotli==1.2.0
+cachetools==7.1.1
+caio==0.9.25
+certifi==2026.4.22
+cffi==2.0.0
+chardet==7.4.3
+charset-normalizer==3.4.7
+click==8.3.3
+click-log==0.4.0
+cohere==5.21.1
+colorama==0.4.6
+Crawl4AI==0.8.6
+cryptography==48.0.0
+cssselect==1.4.0
+cyclopts==4.11.2
+distro==1.9.0
+dnspython==2.8.0
+docstring_parser==0.18.0
+docutils==0.22.4
+email-validator==2.3.0
+eval_type_backport==0.3.1
+exceptiongroup==1.3.1
+executing==2.2.1
+fake-useragent==2.2.0
+fastavro==1.12.2
+fastmcp==3.2.4
+fastuuid==0.14.0
+filelock==3.29.0
+frozenlist==1.8.0
+fsspec==2026.4.0
+genai-prices==0.0.59
+google-auth==2.52.0
+google-genai==2.0.0
+googleapis-common-protos==1.75.0
+greenlet==3.5.0
+griffelib==2.0.2
+groq==1.2.0
+grpcio==1.80.0
+h11==0.16.0
+h2==4.3.0
+hf-xet==1.5.0
+hpack==4.1.0
+httpcore==1.0.9
+httpx==0.28.1
+httpx-sse==0.4.3
+huggingface_hub==1.14.0
+humanize==4.15.0
+hyperframe==6.1.0
+idna==3.13
+importlib_metadata==8.7.1
+jaraco.classes==3.4.0
+jaraco.context==6.1.2
+jaraco.functools==4.4.0
+Jinja2==3.1.6
+jiter==0.14.0
+jmespath==1.1.0
+joblib==1.5.3
+joserfc==1.6.5
+jsonpath-python==1.1.6
+jsonref==1.1.0
+jsonschema==4.26.0
+jsonschema-path==0.4.6
+jsonschema-specifications==2025.9.1
+keyring==25.7.0
+lark==1.3.1
+logfire==4.32.1
+logfire-api==4.32.1
+lxml==5.4.0
+markdown-it-py==4.2.0
+markdownify==1.2.2
+MarkupSafe==3.0.3
+mcp==1.27.1
+mdurl==0.1.2
+mistralai==2.4.5
+more-itertools==11.0.2
+multidict==6.7.1
+networkx==3.6.1
+nexus-rpc==1.4.0
+nltk==3.9.4
+numpy==2.4.4
+openai==2.36.0
+openapi-pydantic==0.5.1
+opentelemetry-api==1.39.1
+opentelemetry-exporter-otlp-proto-common==1.39.1
+opentelemetry-exporter-otlp-proto-http==1.39.1
+opentelemetry-instrumentation==0.60b1
+opentelemetry-instrumentation-httpx==0.60b1
+opentelemetry-proto==1.39.1
+opentelemetry-sdk==1.39.1
+opentelemetry-semantic-conventions==0.60b1
+opentelemetry-util-http==0.60b1
+packaging==25.0
+patchright==1.59.1
+pathable==0.5.0
+pillow==12.2.0
+platformdirs==4.9.6
+playwright==1.59.0
+playwright-stealth==2.0.3
+pocketbase==0.17.1
+prompt_toolkit==3.0.52
+propcache==0.5.2
+protobuf==6.33.6
+psutil==7.2.2
+py-key-value-aio==0.4.4
+pyasn1==0.6.3
+pyasn1_modules==0.4.2
+pycparser==3.0
+pydantic==2.13.4
+pydantic-ai==1.92.0
+pydantic-ai-slim==1.92.0
+pydantic-evals==1.92.0
+pydantic-graph==1.92.0
+pydantic-handlebars==0.1.0
+pydantic-settings==2.14.1
+pydantic_core==2.46.4
+pyee==13.0.1
+Pygments==2.20.0
+PyJWT==2.12.1
+pyOpenSSL==26.2.0
+pyperclip==1.11.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.2.2
+python-multipart==0.0.27
+python-telegram-bot==22.7
+pywin32==311; sys_platform == "win32"
+pywin32-ctypes==0.2.3
+PyYAML==6.0.3
+rank-bm25==0.2.2
+referencing==0.37.0
+regex==2026.4.4
+requests==2.33.1
+rich==15.0.0
+rich-rst==1.3.2
+rpds-py==0.30.0
+rtree==1.4.1
+s3transfer==0.17.0
+scipy==1.17.1
+setuptools==82.0.1
+shapely==2.1.2
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+snowballstemmer==2.2.0
+soupsieve==2.8.3
+sse-starlette==3.4.2
+starlette==1.0.0
+temporalio==1.27.0
+tenacity==9.1.4
+tiktoken==0.12.0
+tokenizers==0.23.1
+tqdm==4.67.3
+trimesh==4.12.2
+typer==0.25.1
+types-protobuf==6.32.1.20260221
+types-requests==2.33.0.20260508
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+uncalled-for==0.3.2
+unclecode-litellm==1.81.13
+urllib3==2.7.0
+uvicorn==0.46.0
+watchfiles==1.1.1
+wcwidth==0.7.0
+websockets==16.0
+wheel==0.46.3
+wrapt==1.17.3
+xai-sdk==1.12.2
+xxhash==3.7.0
+yarl==1.23.0
+zipp==3.23.1
diff --git a/schemas.py b/schemas.py
new file mode 100644
index 0000000..568256b
--- /dev/null
+++ b/schemas.py
@@ -0,0 +1,21 @@
+from typing import Union, Literal
+from pydantic import BaseModel, Field
+
+## This file isn't used at runtime; it's just an outline of the schemas
+
+class BaseEntry(BaseModel):
+    title: str = Field(description="The name of the opportunity")
+    org: str = Field(description="The organisation")
+    summary: str = Field(description="A 3-sentence summary of what this is")
+
+class Event(BaseEntry):
+    type: Literal["event"] = "event"
+    date_time: str = Field(description="Date and time of the event")
+    location: str = Field(description="Location of the event")
+
+class Opportunity(BaseEntry):
+    type: str = Field(description="The type of opportunity (Open Call, Funding, Residency, etc.)")
+    deadline: str = Field(description="The deadline, in dd-mm-yyyy format")
+    location: str = Field(description="Location of entry")
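+
+# A hypothetical way to put these models to work: pass one as the agent's
+# output_type in agent.py, so pydantic-ai validates the LLM output directly
+# instead of hand-parsing JSON. Sketch (not wired up):
+#
+#   opportunity_agent = Agent(model, output_type=Opportunity,
+#                             system_prompt=OPPORTUNITY_PROMPT, retries=5)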
+
+EntrySchema = Union[Event, Opportunity]
\ No newline at end of file
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..69a026e
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,29 @@
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig, DefaultMarkdownGenerator
+
+async def get_clean_content(url: str):
+    md_generator = DefaultMarkdownGenerator(
+        options={
+            "ignore_links": True,
+            "ignore_images": True,
+            "body_width": 0,
+        }
+    )
+
+    browser_conf = BrowserConfig(
+        # cdp_url="http://127.0.0.1:9222",  # Use your existing Chrome session
+        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
+    )
+
+    run_conf = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        word_count_threshold=10,
+        excluded_tags=["nav", "script", "style"],
+        markdown_generator=md_generator,
+        delay_before_return_html=3.0,
+        js_code="window.scrollTo(0, document.body.scrollHeight);",
+        magic=True
+    )
+
+    async with AsyncWebCrawler(config=browser_conf) as crawler:
+        result = await crawler.arun(url=url, config=run_conf)
+        return result.markdown
\ No newline at end of file