Before we dive in, here's what we'll be using:
- Python for automation and file handling
- GroqCloud with the Llama 3.2 90B Vision (preview) model for image classification
- PIL (Python Imaging Library) for image processing
- Base64 encoding for image data transmission
To install the above dependencies, run the following command in your terminal:
pip install groq pillow
If you are like me and have hundreds of screenshots just lying on your desktop with no organization, making everything look messy, then you would love something that can organize those screenshots for you — and that is exactly what this script does automatically.
I have a really cluttered desktop full of random photos, memes, code snippets, and other images I take screenshots of. Llama will clean and organize them, taking the desktop from this:
We are using the groq library, which is OpenAI-compatible and will be used to access our model hosted on GroqCloud.
#!/usr/bin/env python3
"""Automatically sort desktop screenshots into category folders using a Groq-hosted vision model."""

# Standard library
import base64
import glob
import json
import os
import shutil
from datetime import datetime
from io import BytesIO

# Third-party
from dotenv import load_dotenv  # type: ignore
from groq import Groq  # type: ignore
from PIL import Image

# Pull GROQ_API_KEY (and any other settings) from a local .env file into os.environ.
load_dotenv()
We'll take an object-oriented approach and create a class CleanScreenShots
with well-defined responsibilities:
class CleanScreenShots:
    """Organizes desktop screenshots into per-category folders using a vision LLM."""

    # Folder scanned for screenshots: the current user's Desktop.
    DESKTOP_PATH = os.path.join(os.path.expanduser("~"), "Desktop")
    # Categories the model may assign; these are also the destination folder names.
    ALLOWED_CATEGORIES = ["Meme", "Study", "Documents", "Random", "Images"]
    # Generic code coming soon
Our function get_screenshots() will fetch all the screenshots stored on the desktop. It searches for file names that contain "Screenshot" and supports multiple image formats:
def get_screenshots(self):
    """Return paths of desktop files named like screenshots.

    Matches any png/jpg/jpeg file on ``DESKTOP_PATH`` whose name
    contains the word "Screenshot".
    """
    matches = []
    for extension in ("png", "jpg", "jpeg"):
        pattern = os.path.join(self.DESKTOP_PATH, f"*Screenshot*.{extension}")
        matches.extend(glob.glob(pattern))
    return matches
The image processing pipeline involves several key steps: oversized images must be compressed and resized to fit under groq's request-size limit, and every image must be base64-encoded for transmission. Here's how we handle these requirements:
def compress_resize_image(self, image_path, max_size=2.5 * 1024 * 1024):
    """Downscale and re-encode an image, returning it base64-encoded.

    The image is first bounded to 1920x1080 (aspect ratio preserved).
    JPEGs are then re-saved at decreasing quality until the payload fits
    under ``max_size`` bytes (the original ignored this parameter
    entirely); PNGs are saved once with ``optimize=True``.

    Returns the encoded bytes as a UTF-8 base64 string.
    """
    img = Image.open(image_path)
    img.thumbnail((1920, 1080))  # in-place, keeps aspect ratio
    ext = os.path.splitext(image_path)[1].lower()
    buf = BytesIO()
    if ext in (".jpg", ".jpeg"):
        if img.mode not in ("RGB", "L"):
            # JPEG cannot store alpha/palette modes; saving would raise OSError.
            img = img.convert("RGB")
        quality = 85
        while True:
            buf = BytesIO()
            img.save(buf, format="JPEG", quality=quality)
            # Stop once under the limit, or at a quality floor so we
            # never loop forever on pathological images.
            if buf.tell() <= max_size or quality <= 35:
                break
            quality -= 10
    else:
        img.save(buf, format="PNG", optimize=True)
    return base64.b64encode(buf.getvalue()).decode("utf-8")
def encode_image(self, image_path):
    """Return the file at *image_path* as a base64-encoded UTF-8 string.

    Files larger than ~2.5 MB are routed through
    ``compress_resize_image`` first; smaller files are encoded verbatim.
    """
    size_limit = 2.5 * 1024 * 1024
    if os.path.getsize(image_path) <= size_limit:
        with open(image_path, "rb") as fh:
            raw = fh.read()
        return base64.b64encode(raw).decode("utf-8")
    return self.compress_resize_image(image_path, size_limit)
def get_mime_type(self, image_path):
    """Map a file's extension to its MIME type.

    Only png/jpg/jpeg are recognized; anything else falls back to the
    generic binary type.
    """
    mime_by_ext = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
    }
    extension = os.path.splitext(image_path)[1].lower()
    return mime_by_ext.get(extension, "application/octet-stream")
The heart of our script lies in its ability to intelligently categorize images. We use a carefully crafted prompt that instructs the AI model to classify images into specific categories:
The categories are designed to cover most common screenshot types:
- Meme: Funny or satirical images
- Study: Educational content, formulas, notes
- Documents: Text-heavy, information-rich content
- Random: Code snippets, tickets, miscellaneous
- Images: Personal photos and non-meme images
# Classification prompt sent alongside each screenshot. The per-category
# descriptions steer the model; the JSON-schema instruction pairs with
# response_format={"type": "json_object"} below so the reply is machine-parseable.
# NOTE(review): this fragment appears to belong inside a categorize_image
# method defined elsewhere in the article — confirm against the full source.
prompt = f"""Categorize the following image into one of these categories:
Meme : Generally images with some satirical text or just funny images.
Study : Math equations, formulas, experiment results.
Documents: PDFs or text-based documents with useful information.
Random: Code snippets, outdated tickets, invitations.
Images: Images of people that are not memes.
Always return only a JSON object in the schema:
{{"category": "value"}}
The current date is {datetime.today().strftime('%Y-%m-%d')}
"""
We use the following configuration to connect with groqCloud and process our images:
# Multimodal chat payload: a single user turn carrying the text prompt plus the
# screenshot as a data URL. `image_data_url` is built elsewhere from the
# base64-encoded image and its MIME type — TODO confirm against the full source.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt
            },
            {
                "type": "image_url",
                "image_url": {"url": image_data_url},
            },
        ],
    }
]
# GROQ_API_KEY comes from the environment (populated by load_dotenv()).
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
chat_completion = client.chat.completions.create(
    messages=messages,
    model="llama-3.2-90b-vision-preview",
    temperature=1,
    max_completion_tokens=1024,
    top_p=1,
    stream=False,  # one complete response rather than a token stream
    response_format={"type": "json_object"},  # forces valid JSON output
    stop=None,
)
The process_all_screenshots function serves as the main orchestrator, combining all our components:
def save_to_folders(self, category, image_path):
    """Move *image_path* into the folder for *category* on the desktop.

    The category is matched case-insensitively but the folder is always
    created with the canonical capitalized name, so "meme" and "Meme"
    no longer produce two different folders (a defect in the original,
    which also left a debug print behind).

    Raises ValueError (a subclass of Exception, so existing broad
    handlers still work) for categories outside ALLOWED_CATEGORIES.
    """
    canonical = category.strip().capitalize()
    if canonical not in self.ALLOWED_CATEGORIES:
        raise ValueError(f"Cannot save docs for unknown category: {category!r}")
    folder = os.path.join(self.DESKTOP_PATH, canonical)
    os.makedirs(folder, exist_ok=True)
    destination = os.path.join(folder, os.path.basename(image_path))
    shutil.move(image_path, destination)
def process_all_screenshots(self):
    """Main orchestrator: categorize and file away every desktop screenshot.

    Screenshots are processed in batches of 5; between batches the user
    is asked whether to continue. Raises RuntimeError when no
    screenshots are found (the original used ``assert``, which is
    silently stripped under ``python -O``).
    """
    screenshots = self.get_screenshots()
    if not screenshots:
        raise RuntimeError("No screenshots matched, check screenshot names")
    batch_size = 5
    for start in range(0, len(screenshots), batch_size):
        for image_path in screenshots[start:start + batch_size]:
            print(f"Processing {image_path}...")
            result = json.loads(self.categorize_image(image_path))
            self.save_to_folders(result["category"], image_path)
        # Only prompt when more batches remain.
        if start + batch_size < len(screenshots) and not self._ask_to_continue():
            break

def _ask_to_continue(self):
    """Prompt until the user answers y/yes/n/no; True means keep going."""
    while True:
        answer = input("Do you wish to continue? (y/n): ").strip().lower()
        if answer in ("y", "yes"):
            return True
        if answer in ("n", "no"):
            return False
        print("Invalid input. Please enter 'y' or 'n'.")
To use this script, simply create an instance of CleanScreenShots and call process_all_screenshots:
if __name__ == "__main__":
    # Script entry point: organize every screenshot on the desktop.
    organizer = CleanScreenShots()
    organizer.process_all_screenshots()
"True AGI is when AI can do a task with 96.012% accuracy that a human can do with 100%" — Vibe coder