From e4266315c78fd4d2d21eaae25efd5ba4b255a30a Mon Sep 17 00:00:00 2001
From: jenz <jenz>
Date: Sat, 14 Mar 2026 19:15:38 +0100
Subject: [PATCH] make the image prompt use content from the previous image so
 that consecutive images feel more chained together

---
 stoat_retarded_ai_tree/python/main.py | 50 +++++++++++++++++++++------
 1 file changed, 39 insertions(+), 11 deletions(-)
diff --git a/stoat_retarded_ai_tree/python/main.py b/stoat_retarded_ai_tree/python/main.py
index 9ffbe90..fbab09e 100644
--- a/stoat_retarded_ai_tree/python/main.py
+++ b/stoat_retarded_ai_tree/python/main.py
@@ -77,17 +77,22 @@ def check_if_emote_reacted():
         message = response.json()[0]
     except:
         initiate_tree()
-        return None, None
+        return None, None, None
     if 'reactions' in message:
         #print('reactions: ', message['reactions'])
         for key, _ in message['reactions'].items():
-            return key, extract_metrics(message["content"])
-    return None, None
+            attachment = message["attachments"][0]
+            attachment_url = f"https://stoat.unloze.com/autumn/attachments/{attachment['_id']}/tree.png"
+            return key, extract_metrics(message["content"]), attachment_url
+    return None, None, None
 
 def is_custom_emote(emote_key):
     return emote_key.isascii() and emote_key.isalnum()
 
-def send_claude_message(reacted_emote, metrics, custom_emotes_json):
+def send_claude_message(reacted_emote, metrics, custom_emotes_json, attachment_url):
+    prev_image_data = requests.get(attachment_url).content
+    prev_image_base64 = base64.b64encode(prev_image_data).decode("utf-8")
+
     if is_custom_emote(reacted_emote):
         img_response = requests.get("https://stoat.unloze.com/autumn/emojis/" + reacted_emote)
         img_data = img_response.content
@@ -123,11 +128,19 @@ def send_claude_message(reacted_emote, metrics, custom_emotes_json):
             json={
                 "model": "claude-haiku-4-5-20251001",
                 "max_tokens": 1024,
-                "system": """You are managing a chaotic living tree. You receive the tree's current metrics and an emoji reaction, and you must decide how the emoji affects the tree. Invent new metrics freely but only track things numerically. Be silly, chaotic and creative, certain times the sapling or tree can also shrink if the emote would cause damage or other unexpected side effects. The tree has growth phases: sapling (0-500cm), young tree (500-5000cm), mature tree (5000-50000cm), ancient tree (50000cm+). Mention the phase when it changes. Keep the metrics object to a maximum of 24 metrics at all times. When adding a new metric, remove the least interesting or relevant existing one to make room. Always keep height_cm. Also return a "sleep_seconds" field: the cooldown before the next reaction is accepted. Scale it strictly based on height_cm using these boundaries: 0-500cm (sapling): 20-60 seconds, 500-5000cm (young tree): 60-300 seconds, 5000-50000cm (mature tree): 300-900 seconds, 50000cm+ (ancient tree): 900-1800 seconds. Never exceed 1800 seconds. Stay within the range for the current phase. Respond in JSON like: {"metrics": {"height_cm": 42, ...}, "message": "...", "sleep_seconds": 30, "image_prompt": "..."}. The image_prompt should visually depict what the message describes happening to the tree — any effects, damage, mutations or chaos described in the message for example. The image_prompt should depict the tree with the custom emote character or object visually integrated into or interacting with the tree — as if the emote itself is physically there affecting it. When writing the image_prompt, translate height_cm into a visual size description — for example 1-50cm is a tiny sprout, 50-200cm is a small sapling, 200-500cm is a knee-height bush-like tree, etc. Use descriptive size language rather than raw numbers. 
When writing the image_prompt, consider the current metrics values to add visual detail — for example high glow_intensity means the tree glows brightly, high chaotic_energy means it looks wild and unstable, low health_points means it looks sickly etc.""",
+                "system": """You are managing a living tree. You receive the tree's current metrics and an emoji reaction, and you must decide how the emoji affects the tree. Invent new metrics freely but only track things numerically. Be silly, chaotic and creative. Based on the description of what happened to the tree, decide whether the tree grows or shrinks and by how much. Growth and shrink amounts should be proportional to the current phase — in the sapling phase typical changes are 1-50cm, young tree 1-100cm, mature tree 1-1000cm, ancient tree 1-10000cm but you can deviate. Not all emotes should cause growth — damaging, violent or destructive emotes should typically cause the tree to shrink. The tree has growth phases: sapling (0-500cm), young tree (500-5000cm), mature tree (5000-50000cm), ancient tree (50000cm+). Mention the phase when it changes. Keep the metrics object to a maximum of 24 metrics at all times. When adding a new metric, remove the least interesting or relevant existing one to make room. Always keep height_cm. Also return a "sleep_seconds" field: the cooldown before the next reaction is accepted. Scale it strictly based on height_cm using these boundaries: 0-500cm (sapling): 20-60 seconds, 500-5000cm (young tree): 60-300 seconds, 5000-50000cm (mature tree): 300-900 seconds, 50000cm+ (ancient tree): 900-1800 seconds. Never exceed 1800 seconds. Stay within the range for the current phase. Respond in JSON like: {"metrics": {"height_cm": 42, ...}, "message": "...", "sleep_seconds": 30, "image_prompt": "..."}. The image_prompt should visually depict what the message describes happening to the tree — any effects, damage, mutations or chaos described in the message for example. The image_prompt should depict the tree with the custom emote character or object visually integrated into or interacting with the tree — as if the emote itself is physically there affecting it. 
When writing the image_prompt, translate height_cm into a visual size description — for example 1-50cm is a tiny sprout, 50-200cm is a small sapling, 200-500cm is a knee-height bush-like tree, etc. Use descriptive size language rather than raw numbers. When writing the image_prompt, consider the current metrics values to add visual detail — for example high glow_intensity means the tree glows brightly, high chaotic_energy means it looks wild and unstable, low health_points means it looks sickly etc.""",
                 "messages": [
                     {
                         "role": "user",
                         "content": [
+                            {
+                                "type": "image",
+                                "source": {
+                                    "type": "base64",
+                                    "media_type": "image/webp",
+                                    "data": prev_image_base64
+                                }
+                            },
                             {
                                 "type": "image",
                                 "source": {
@@ -138,7 +151,7 @@ def send_claude_message(reacted_emote, metrics, custom_emotes_json):
                             },
                             {
                                 "type": "text",
-                                "text": f"Current metrics: {metrics}\nCustom emote name (use this as a hint to identify the emote, ignore if empty): '{emote_name}'.\nDescribe what you see in the image, then decide what it does to the tree."
+                                "text": f"The first image is the previous tree state. The second image is the custom emote that was just reacted. Carry over relevant visual elements from the previous tree into the new image_prompt so the images feel chained together, though some elements can fade over time. Current metrics: {metrics}\nCustom emote name (use this as a hint to identify the emote, ignore if empty): '{emote_name}'.\nDescribe what you see in both images, then decide what the emote does to the tree."
                             }
                         ]
                     }
@@ -156,9 +169,24 @@ def send_claude_message(reacted_emote, metrics, custom_emotes_json):
             json={
                 "model": "claude-haiku-4-5-20251001",
                 "max_tokens": 1024,
-                "system": """You are managing a chaotic living tree. You receive the tree's current metrics and an emoji reaction, and you must decide how the emoji affects the tree. Invent new metrics freely but only track things numerically. Be silly, chaotic and creative, certain times the sapling or tree can also shrink if the emote would cause damage or other unexpected side effects. The tree has growth phases: sapling (0-500cm), young tree (500-5000cm), mature tree (5000-50000cm), ancient tree (50000cm+). Mention the phase when it changes. Keep the metrics object to a maximum of 24 metrics at all times. When adding a new metric, remove the least interesting or relevant existing one to make room. Always keep height_cm. Also return a "sleep_seconds" field: the cooldown before the next reaction is accepted. Scale it strictly based on height_cm using these boundaries: 0-500cm (sapling): 20-60 seconds, 500-5000cm (young tree): 60-300 seconds, 5000-50000cm (mature tree): 300-900 seconds, 50000cm+ (ancient tree): 900-1800 seconds. Never exceed 1800 seconds. Stay within the range for the current phase. Respond in JSON like: {"metrics": {"height_cm": 42, ...}, "message": "...", "sleep_seconds": 30, "image_prompt": "..."}. The image_prompt should visually depict what the message describes happening to the tree — any effects, damage, mutations or chaos described in the message and caused by the emoji reaction. When writing the image_prompt, translate height_cm into a visual size description — for example 1-50cm is a tiny sprout, 50-200cm is a small sapling, 200-500cm is a knee-height bush-like tree, etc. Use descriptive size language rather than raw numbers. When writing the image_prompt, consider the current metrics values to add visual detail — for example high glow_intensity means the tree glows brightly, high chaotic_energy means it looks wild and unstable, low health_points means it looks sickly etc.""",
+                "system": """You are managing a chaotic living tree. You receive the tree's current metrics and an emoji reaction, and you must decide how the emoji affects the tree. Invent new metrics freely but only track things numerically. Be silly, chaotic and creative. Based on the description of what happened to the tree, decide whether the tree grows or shrinks and by how much. Growth and shrink amounts should be proportional to the current phase — in the sapling phase typical changes are 1-50cm, young tree 1-100cm, mature tree 1-1000cm, ancient tree 1-10000cm but you can deviate. Not all emotes should cause growth — damaging, violent or destructive emotes should typically cause the tree to shrink. The tree has growth phases: sapling (0-500cm), young tree (500-5000cm), mature tree (5000-50000cm), ancient tree (50000cm+). Mention the phase when it changes. Keep the metrics object to a maximum of 24 metrics at all times. When adding a new metric, remove the least interesting or relevant existing one to make room. Always keep height_cm. Also return a "sleep_seconds" field: the cooldown before the next reaction is accepted. Scale it strictly based on height_cm using these boundaries: 0-500cm (sapling): 20-60 seconds, 500-5000cm (young tree): 60-300 seconds, 5000-50000cm (mature tree): 300-900 seconds, 50000cm+ (ancient tree): 900-1800 seconds. Never exceed 1800 seconds. Stay within the range for the current phase. Respond in JSON like: {"metrics": {"height_cm": 42, ...}, "message": "...", "sleep_seconds": 30, "image_prompt": "..."}. The image_prompt should visually depict what the message describes happening to the tree — any effects, damage, mutations or chaos described in the message and caused by the emoji reaction. When writing the image_prompt, translate height_cm into a visual size description — for example 1-50cm is a tiny sprout, 50-200cm is a small sapling, 200-500cm is a knee-height bush-like tree, etc. 
Use descriptive size language rather than raw numbers. When writing the image_prompt, consider the current metrics values to add visual detail — for example high glow_intensity means the tree glows brightly, high chaotic_energy means it looks wild and unstable, low health_points means it looks sickly etc.""",
                 "messages": [
-                        {"role": "user", "content": f"Current metrics: {metrics}\nEmoji reacted: {reacted_emote}\nWhat does this emoji do to the tree?"}
+                        {"role": "user", "content": [
+                            {
+                                "type": "image",
+                                "source": {
+                                    "type": "base64",
+                                    "media_type": "image/webp",
+                                    "data": prev_image_base64
+                                }
+                            },
+                            {
+                                "type": "text",
+                                "text": f"The image is the previous tree state. Carry over relevant visual elements from the previous tree into the new image_prompt so the images feel chained together, though some elements can fade over time. Current metrics: {metrics}\nEmoji reacted: {reacted_emote}\nDescribe what you see in the image and what the emote does to the tree."
+                            }
+                            ]
+                        }
+
                 ]
             }
         )
@@ -169,7 +197,7 @@ def send_claude_message(reacted_emote, metrics, custom_emotes_json):
     start = raw_text.find("{")
     end = raw_text.rfind("}") + 1
     parsed = json.loads(raw_text[start:end])
-    print(parsed)
+    #print(parsed)
     return parsed 
 
 def replicate_fetch_image(image_prompt):
@@ -183,10 +211,10 @@ def replicate_fetch_image(image_prompt):
 def main():
     custom_emotes_json = get_custom_emotes()
     while True:
-        reacted_emote, metrics = check_if_emote_reacted()
+        reacted_emote, metrics, attachment_url = check_if_emote_reacted()
         print(reacted_emote, metrics)
         if reacted_emote:
-            claude_response = send_claude_message(reacted_emote, metrics, custom_emotes_json)
+            claude_response = send_claude_message(reacted_emote, metrics, custom_emotes_json, attachment_url)
             replicate_image_response = replicate_fetch_image(claude_response["image_prompt"])
 
             sleep_seconds = claude_response.get("sleep_seconds", 30)