diff --git a/log-work-expense/SKILL.md b/log-work-expense/SKILL.md index 3155988..c8f85b1 100644 --- a/log-work-expense/SKILL.md +++ b/log-work-expense/SKILL.md @@ -16,7 +16,13 @@ Log receipt images and expense details into the `wip.work_expenses` collection i ## Steps -1. **Extract receipt info** — If the user sent an image, use the image tool to read the date, vendor, and amount. If the image tool fails, read the image with the `read` tool and try to extract the info visually. If you cannot confidently read the date, **ask the user** — never guess. If they only provided text, use that. +1. **Extract receipt info** — If the user sent an image, run the OCR script to read the date, vendor, and amount: + +```bash +python3 scripts/ocr_receipt.py /path/to/image.jpg +``` + +The script is located at `~/notes/skills/log-work-expense/scripts/ocr_receipt.py`; run the command from the skill directory (`~/notes/skills/log-work-expense`) or pass that full path to `python3`. It prints JSON with `date`, `vendor`, and `amount` fields. If it exits with an error, prints a `raw` field instead of those fields, or the `date` is missing or doubtful, **ask the user** — never guess. If they only provided text, use that. 2. **Upload the receipt image to S2** — If the user provided an image, upload it to S2 and get the public URL. If they only provided a file or no image, skip this step. 
#!/usr/bin/env python3
"""OCR a receipt image via OpenRouter and print the extracted fields as JSON.

Usage:
    OPENROUTER_API_KEY=... python3 ocr_receipt.py /path/to/image.jpg

On success one JSON value is printed to stdout: normally an object with the
keys ``date`` (YYYY-MM-DD), ``vendor`` and ``amount``, as requested from the
model. If the model's reply is not valid JSON, ``{"raw": "<reply text>"}`` is
printed instead. On failure ``{"error": ...}`` is printed to stderr and the
process exits non-zero.
"""
import base64
import json
import mimetypes
import os
import sys
from typing import Optional, Sequence

# SECURITY: the API key is read from the environment instead of being embedded
# in the source. Any key that was ever committed must be treated as leaked and
# revoked/rotated.
API_KEY_ENV = "OPENROUTER_API_KEY"
API_URL = "https://openrouter.ai/api/v1/chat/completions"
MODEL = "google/gemini-2.5-flash-lite"
# requests has no default timeout; without one a stalled connection hangs forever.
REQUEST_TIMEOUT_SECONDS = 60
# Fallback when the file extension does not identify an image type.
DEFAULT_IMAGE_MIME = "image/jpeg"
PROMPT = (
    "Extract from this receipt: date, vendor/restaurant name, and total amount. "
    "Return ONLY a JSON object with keys: date (YYYY-MM-DD format), vendor, amount. "
    "No other text."
)


class OcrError(Exception):
    """The OCR request failed; ``detail`` is a JSON-serializable description."""

    def __init__(self, detail: object) -> None:
        super().__init__(detail)
        self.detail = detail


def guess_image_mime(image_path: str) -> str:
    """Return the image MIME type implied by the file extension.

    Falls back to ``image/jpeg`` when the extension is unknown or does not map
    to an ``image/*`` type.
    """
    guessed, _encoding = mimetypes.guess_type(image_path)
    if guessed and guessed.startswith("image/"):
        return guessed
    return DEFAULT_IMAGE_MIME


def encode_image(image_path: str) -> str:
    """Read the file at *image_path* and return its bytes base64-encoded.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("ascii")


def build_request_payload(image_b64: str, mime: str) -> dict:
    """Build the chat-completions request body for one receipt image.

    The image is sent inline as a ``data:`` URL alongside the extraction prompt.
    """
    return {
        "model": MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime};base64,{image_b64}"},
                    },
                ],
            }
        ],
        "max_tokens": 200,
    }


def strip_code_fences(text: str) -> str:
    """Remove a surrounding Markdown code fence from *text*, if present.

    Handles an opening fence with a language tag on its own line
    (```` ```json ````) as well as a fence with no newline after it.
    """
    text = text.strip()
    if not text.startswith("```"):
        return text
    _opening, newline, rest = text.partition("\n")
    # With a newline, the whole first line (fence + optional language tag) is
    # dropped; without one, only the three backticks can be removed safely.
    text = rest if newline else text[3:]
    text = text.rstrip()
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def parse_model_reply(text: str) -> object:
    """Parse the model's reply as JSON after stripping code fences.

    Returns the decoded JSON value, or ``{"raw": <cleaned text>}`` when the
    reply is not valid JSON, so the caller can still see what the model said.
    """
    cleaned = strip_code_fences(text)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return {"raw": cleaned}


def fetch_model_reply(image_b64: str, mime: str, api_key: str) -> str:
    """POST the image to OpenRouter and return the model's text reply.

    Raises:
        OcrError: on a transport failure, a non-JSON response, an ``error``
            member in the response, or a response without text content.
    """
    # Imported here rather than at module level so the pure helpers in this
    # file stay importable where the third-party HTTP client is not installed.
    import requests

    try:
        resp = requests.post(
            API_URL,
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            json=build_request_payload(image_b64, mime),
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as exc:
        raise OcrError(f"request to OpenRouter failed: {exc}") from exc

    try:
        data = resp.json()
    except ValueError as exc:  # body was not JSON (e.g. an HTML gateway error)
        raise OcrError(
            f"OpenRouter returned a non-JSON response (HTTP {resp.status_code})"
        ) from exc

    if isinstance(data, dict) and "error" in data:
        # Pass the API's own error object through unchanged.
        raise OcrError(data["error"])

    try:
        content = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise OcrError(
            f"unexpected response shape from OpenRouter (HTTP {resp.status_code})"
        ) from exc
    if not isinstance(content, str):
        raise OcrError("model reply contained no text content")
    return content


def report_error(detail: object) -> None:
    """Print ``{"error": detail}`` as JSON on stderr."""
    print(json.dumps({"error": detail}), file=sys.stderr)


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the CLI and return the process exit status.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv``. Arguments after the image path are ignored.

    Returns:
        0 on success, 1 on an OCR/configuration failure, 2 on a usage error.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        prog = args[0] if args else "ocr_receipt.py"
        print(f"usage: {prog} IMAGE_PATH", file=sys.stderr)
        return 2
    image_path = args[1]

    api_key = os.environ.get(API_KEY_ENV)
    if not api_key:
        report_error(f"environment variable {API_KEY_ENV} is not set")
        return 1

    try:
        image_b64 = encode_image(image_path)
    except OSError as exc:
        report_error(f"cannot read image: {exc}")
        return 1

    try:
        reply = fetch_model_reply(image_b64, guess_image_mime(image_path), api_key)
    except OcrError as exc:
        report_error(exc.detail)
        return 1

    print(json.dumps(parse_model_reply(reply)))
    return 0


if __name__ == "__main__":
    sys.exit(main())