From 9e522fc16410bbc62d57a0ebfcf5e8e5d217b716 Mon Sep 17 00:00:00 2001 From: Connor Rhodes Date: Wed, 29 Apr 2026 02:40:45 +0000 Subject: [PATCH] Add vision-analysis skill; move workaround from TOOLS.md; update log-work-expense to skip image analysis --- log-work-expense/SKILL.md | 2 ++ vision-analysis/SKILL.md | 22 ++++++++++++++++++++++ vision-analysis/scripts/analyze.py | 20 ++++++++++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 vision-analysis/SKILL.md create mode 100644 vision-analysis/scripts/analyze.py diff --git a/log-work-expense/SKILL.md b/log-work-expense/SKILL.md index e1d231e..6d69bda 100644 --- a/log-work-expense/SKILL.md +++ b/log-work-expense/SKILL.md @@ -16,6 +16,8 @@ Log work expenses into `wip.work_expenses` via `scripts/log_expense.py`. 1. **Classify images** — Run each image through `scripts/classify_image.py ` to determine if it is a `receipt` or `odometer`. The script outputs exactly one word. + ⚠️ **Do NOT analyze receipt/odometer images yourself.** Do not extract merchant names, amounts, item details, or calculate mileage. Only classify and upload. The note field should contain only what the user provides or minimal context — never information you derived from reading the images. + 2. **Determine type and account** — The user provides the expense type and account (e.g. "meals and mileage for LTISD"). Ask if unclear. 3. **Upload images to S2** — Upload all images to S2 and collect the URLs. diff --git a/vision-analysis/SKILL.md b/vision-analysis/SKILL.md new file mode 100644 index 0000000..b3a702f --- /dev/null +++ b/vision-analysis/SKILL.md @@ -0,0 +1,22 @@ +--- +name: vision-analysis +description: Analyze images using a vision model via Python. Use when the built-in image tool fails or when you need to extract text, describe content, or analyze one or more images. Triggers on any request to analyze, read, describe, or extract information from images. 
"""Send a prompt plus local images to a vision model through OpenRouter.

Usage:
    analyze.py "<prompt>" [image1] [image2] ...

The model's reply is printed to stdout. Errors are printed to stderr and
reported through the exit status (2 = usage/configuration, 1 = runtime).

The API key is read from the ``OPENROUTER_API_KEY`` environment variable.
"""
import base64
import json
import mimetypes
import os
import sys

API_URL = "https://openrouter.ai/api/v1/chat/completions"
# NOTE(review): an API key used to be hard-coded in this file. It must be
# revoked/rotated, because it remains readable in the repository history.
API_KEY_ENV = "OPENROUTER_API_KEY"
MODEL = "google/gemini-2.5-flash-lite"
MAX_TOKENS = 2000
REQUEST_TIMEOUT = 120  # seconds; without a timeout a stalled call never returns
DEFAULT_IMAGE_MIME = "image/jpeg"


def guess_image_mime(path):
    """Return the image MIME type implied by *path*'s file name.

    Falls back to ``image/jpeg`` when the extension is missing, unknown, or
    not an image type.
    """
    # Only the base name is inspected so directory components can never be
    # mistaken for a URL scheme by ``mimetypes``.
    mime, _ = mimetypes.guess_type(os.path.basename(path))
    if mime and mime.startswith("image/"):
        return mime
    return DEFAULT_IMAGE_MIME


def encode_image(path):
    """Read the file at *path* and return it as a base64 ``data:`` URL.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(path, "rb") as handle:
        encoded = base64.b64encode(handle.read()).decode("ascii")
    return f"data:{guess_image_mime(path)};base64,{encoded}"


def build_content(prompt, paths):
    """Build the message ``content`` list: the text part, then one part per image.

    Args:
        prompt: Instruction text sent to the model.
        paths: Iterable of local image file paths (may be empty).

    Raises:
        OSError: if any image file cannot be read.
    """
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {"type": "image_url", "image_url": {"url": encode_image(path)}}
        for path in paths
    )
    return content


def extract_text(payload):
    """Return the first choice's message content from a decoded response.

    Raises:
        RuntimeError: if *payload* lacks ``choices[0].message.content``. The
            message includes the payload's ``error`` entry when present,
            otherwise a truncated dump of the payload.
    """
    try:
        return payload["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        detail = payload.get("error", payload) if isinstance(payload, dict) else payload
        snippet = json.dumps(detail, default=str)[:500]
        raise RuntimeError(f"unexpected API response: {snippet}") from None


def request_completion(prompt, paths, api_key, model=MODEL, max_tokens=MAX_TOKENS):
    """POST the prompt and images to the chat-completions endpoint.

    Returns:
        The text of the model's reply.

    Raises:
        OSError: if an image file cannot be read.
        RuntimeError: on transport failure, a non-JSON body, or a response
            without the expected structure.
    """
    # Imported here so usage and configuration errors can be reported even
    # when the third-party dependency is not installed.
    import requests

    content = build_content(prompt, paths)
    try:
        resp = requests.post(
            API_URL,
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": model,
                "messages": [{"role": "user", "content": content}],
                "max_tokens": max_tokens,
            },
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        raise RuntimeError(f"request to OpenRouter failed: {exc}") from exc

    try:
        payload = resp.json()
    except ValueError:
        raise RuntimeError(
            f"non-JSON response from OpenRouter (HTTP {resp.status_code})"
        ) from None
    return extract_text(payload)


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Args:
        argv: Argument vector including the program name. Defaults to
            ``sys.argv``.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        program = os.path.basename(args[0]) if args else "analyze.py"
        print(f'usage: {program} "<prompt>" [image1] [image2] ...', file=sys.stderr)
        return 2
    prompt, paths = args[1], args[2:]

    api_key = os.environ.get(API_KEY_ENV, "").strip()
    if not api_key:
        print(
            f"error: environment variable {API_KEY_ENV} is not set",
            file=sys.stderr,
        )
        return 2

    try:
        print(request_completion(prompt, paths, api_key))
    except (OSError, RuntimeError) as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())