adrian-altner.com/scripts/vision.ts

#!/usr/bin/env -S node --experimental-strip-types

import { execFile } from "node:child_process";
import { readFile, writeFile } from "node:fs/promises";
import { relative, resolve } from "node:path";
import { fileURLToPath } from "node:url";
import { promisify } from "node:util";
import Anthropic from "@anthropic-ai/sdk";
import { consola } from "consola";
import {
  getImagesMissingMetadata,
  getMetadataPathForImage,
  getPhotoAbsolutePath,
  getPhotoDirectories,
  PHOTOS_DIRECTORY,
} from "../src/lib/photo-albums.ts";

const execFileAsync = promisify(execFile);

/**
 * Define the directory where the images are located.
 */
const PHOTOS_DIR = PHOTOS_DIRECTORY;

/**
 * Instantiate the Anthropic client.
 */
let anthropic: Anthropic | undefined;

function getAnthropicClient(): Anthropic {
  anthropic ??= new Anthropic({ maxRetries: 0 });
  return anthropic;
}

function assertRequiredEnvironment(): void {
  if (!process.env.ANTHROPIC_API_KEY) {
    throw new Error(
      "Missing ANTHROPIC_API_KEY. `pnpm run vision` loads `.env.local` automatically. If you run the script directly, use `node --env-file=.env.local --experimental-strip-types scripts/vision.ts`.",
    );
  }
}

/**
 * Represents the metadata of an image in the Exif format.
 */
export interface ExifMetadata {
  SourceFile: string;
  FileName: string;
  Model: string;
  FNumber: number;
  FocalLength: string;
  ExposureTime: string;
  ISO: number;
  DateTimeOriginal: string;
  LensModel: string;
  GPSPosition?: string;
  GPSLatitude?: string;
  GPSLongitude?: string;
}

/**
 * Represents the result of the AI analysis.
 */
export interface VisionAIResult {
  title_ideas: string[];
  description: string;
  tags: string[];
}

/**
 * Represents the final metadata suggestion for an image.
 */
export interface ImageMetadataSuggestion {
  id: string;
  title: string[];
  image: string;
  alt: string;
  location: string;
  date: string;
  tags: string[];
  exif: {
    camera: string;
    lens: string;
    aperture: string;
    iso: string;
    focal_length: string;
    shutter_speed: string;
  };
}

interface VisionCliOptions {
  refresh: boolean;
  photosDirectory?: string;
  visionConcurrency: number;
  visionMaxRetries: number;
  visionBaseBackoffMs: number;
}

function parseCliOptions(argv: string[]): VisionCliOptions {
  const getNumericOption = (name: string, fallback: number): number => {
    const prefix = `--${name}=`;
    const rawValue = argv
      .find((arg) => arg.startsWith(prefix))
      ?.slice(prefix.length);
    const parsed = Number.parseInt(rawValue ?? "", 10);
    return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
  };

  const envConcurrency = Number.parseInt(
    process.env.VISION_CONCURRENCY ?? "",
    10,
  );
  const envMaxRetries = Number.parseInt(
    process.env.VISION_MAX_RETRIES ?? "",
    10,
  );
  const envBaseBackoffMs = Number.parseInt(
    process.env.VISION_BASE_BACKOFF_MS ?? "",
    10,
  );
  const nonFlagArgs = argv.filter((arg) => !arg.startsWith("--"));

  return {
    refresh: argv.includes("--refresh"),
    photosDirectory: resolve(nonFlagArgs[0] ?? PHOTOS_DIR),
    visionConcurrency: getNumericOption(
      "concurrency",
      Number.isFinite(envConcurrency) && envConcurrency > 0
        ? envConcurrency
        : 2,
    ),
    visionMaxRetries: getNumericOption(
      "retries",
      Number.isFinite(envMaxRetries) && envMaxRetries > 0 ? envMaxRetries : 8,
    ),
    visionBaseBackoffMs: getNumericOption(
      "backoff-ms",
      Number.isFinite(envBaseBackoffMs) && envBaseBackoffMs > 0
        ? envBaseBackoffMs
        : 1500,
    ),
  };
}

function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

function isRateLimitError(error: unknown): boolean {
  return error instanceof Anthropic.RateLimitError;
}

function extractRetryAfterMs(error: unknown): number | null {
  if (!(error instanceof Anthropic.RateLimitError)) {
    return null;
  }

  const retryAfter = error.headers?.get("retry-after");
  if (retryAfter) {
    const seconds = Number.parseFloat(retryAfter);
    if (Number.isFinite(seconds) && seconds > 0) {
      return Math.ceil(seconds * 1000);
    }
  }

  return null;
}

async function mapWithConcurrency<T, R>(
  values: T[],
  concurrency: number,
  mapper: (value: T, index: number) => Promise<R>,
): Promise<R[]> {
  if (values.length === 0) {
    return [];
  }

  const results: R[] = new Array(values.length);
  const workerCount = Math.max(1, Math.min(concurrency, values.length));
  let cursor = 0;

  const workers = Array.from({ length: workerCount }, async () => {
    while (true) {
      const currentIndex = cursor;
      cursor += 1;

      if (currentIndex >= values.length) {
        return;
      }

      const value = values[currentIndex];
      if (typeof value === "undefined") {
        continue;
      }

      results[currentIndex] = await mapper(value, currentIndex);
    }
  });

  await Promise.all(workers);
  return results;
}

/**
 * Get all images that don't have a JSON file and therefore need to be processed.
 */
export async function getImagesToProcess(
  photosDirectory = PHOTOS_DIR,
  options: Pick<VisionCliOptions, "refresh"> = { refresh: false },
): Promise<string[]> {
  const relativeImagePaths = options.refresh
    ? (await getPhotoDirectories(photosDirectory)).flatMap(
        (directory) => directory.imagePaths,
      )
    : await getImagesMissingMetadata(photosDirectory);

  consola.info(
    options.refresh
      ? `Refreshing ${relativeImagePaths.length} ${relativeImagePaths.length === 1 ? "image" : "images"} with metadata sidecars`
      : `Found ${relativeImagePaths.length} ${relativeImagePaths.length === 1 ? "image" : "images"} without metadata`,
  );

  return relativeImagePaths.map((imagePath) =>
    getPhotoAbsolutePath(imagePath, photosDirectory),
  );
}

/**
 * Extracts the EXIF metadata from an image file.
 * @param imagePath - The path to the image file.
 *
 * @returns A promise that resolves to the extracted EXIF metadata.
 */
export async function extractExifMetadata(
  imagePath: string,
): Promise<ExifMetadata> {
  /// Check if `exiftool` is installed.
  try {
    await execFileAsync("exiftool", ["--version"]);
  } catch (_error) {
    consola.error(
      "exiftool is not installed. Please run `brew install exiftool`.",
    );
    process.exit(1);
  }

  /// Extract the metadata
  const { stdout } = await execFileAsync("exiftool", ["-j", imagePath]);
  const output = JSON.parse(stdout) as ExifMetadata[];

  if (!output[0]) {
    throw new Error(`No EXIF metadata found for ${imagePath}.`);
  }

  return output[0];
}

/**
 * Encodes an image file to base64.
 * @param imagePath - The path to the image file.
 * @returns A Promise that resolves to the base64 encoded image.
 */
async function base64EncodeImage(imagePath: string): Promise<string> {
  const buffer = await readFile(imagePath);
  return buffer.toString("base64");
}

const VISION_TOOL = {
  name: "vision_response",
  description: "Return the vision analysis of the image.",
  input_schema: {
    type: "object" as const,
    additionalProperties: false,
    properties: {
      title_ideas: { type: "array", items: { type: "string" } },
      description: { type: "string" },
      tags: { type: "array", items: { type: "string" } },
    },
    required: ["title_ideas", "description", "tags"],
  },
};

/**
 * Generates image description, title suggestions and tags using AI.
 *
 * @param metadata - The metadata of the image.
 * @returns A Promise that resolves to a VisionAIResult object containing the generated image description, title suggestions, and tags.
 */
async function generateImageDescriptionTitleSuggestionsAndTags(
  metadata: ExifMetadata,
  options: Pick<VisionCliOptions, "visionMaxRetries" | "visionBaseBackoffMs">,
): Promise<VisionAIResult> {
  /// Base64 encode the image in order to pass it to the API
  const encodedImage = await base64EncodeImage(metadata.SourceFile);

  const prompt =
    "Create an accurate and detailed description of this image that would also work as an alt text. The alt text should not contain words like image, photograph, illustration or such. Describe the scene as it is. Also come up with 5 title suggestions for this image. At last suggest 5 tags that suit the image description. These tags should be single words only. Identify the main subject or theme and make sure to put the according tag first. Return the description, the title suggestions and tags.";

  let lastError: unknown;

  for (let attempt = 0; attempt <= options.visionMaxRetries; attempt += 1) {
    try {
      const response = await getAnthropicClient().messages.create({
        model: "claude-opus-4-6",
        max_tokens: 2048,
        tools: [VISION_TOOL],
        tool_choice: { type: "tool", name: "vision_response" },
        messages: [
          {
            role: "user",
            content: [
              {
                type: "image",
                source: {
                  type: "base64",
                  media_type: "image/jpeg",
                  data: encodedImage,
                },
              },
              { type: "text", text: prompt },
            ],
          },
        ],
      });

      const toolUseBlock = response.content.find((b) => b.type === "tool_use");
      if (!toolUseBlock || toolUseBlock.type !== "tool_use") {
        throw new Error(
          `No tool use response from AI for ${metadata.SourceFile}.`,
        );
      }

      const parsedResponse = toolUseBlock.input as VisionAIResult;

      if (
        parsedResponse.title_ideas.length === 0 ||
        parsedResponse.description.length === 0 ||
        parsedResponse.tags.length === 0
      ) {
        throw new Error(
          `Incomplete vision response for ${metadata.SourceFile}.`,
        );
      }

      return parsedResponse;
    } catch (error) {
      lastError = error;
      if (!isRateLimitError(error) || attempt >= options.visionMaxRetries) {
        break;
      }

      const retryAfterMs = extractRetryAfterMs(error);
      const exponentialBackoffMs = options.visionBaseBackoffMs * 2 ** attempt;
      const jitterMs = Math.floor(Math.random() * 350);
      const waitMs =
        Math.max(retryAfterMs ?? 0, exponentialBackoffMs) + jitterMs;
      const relativeSourcePath = relative(process.cwd(), metadata.SourceFile);
      const nextAttempt = attempt + 1;
      consola.warn(
        `Rate limit for ${relativeSourcePath}. Retry ${nextAttempt}/${options.visionMaxRetries} in ${Math.ceil(waitMs / 1000)}s...`,
      );
      await sleep(waitMs);
    }
  }

  throw lastError;
}

function ensureVisionCanRun(imagesToProcess: string[]): void {
  if (imagesToProcess.length === 0) {
    return;
  }

  assertRequiredEnvironment();
}

function getLocationFromExif(exifData: ExifMetadata): string {
  if (exifData.GPSPosition) {
    return exifData.GPSPosition;
  }

  if (exifData.GPSLatitude && exifData.GPSLongitude) {
    return `${exifData.GPSLatitude}, ${exifData.GPSLongitude}`;
  }

  return "";
}

/**
 * Merges the metadata from EXIF data and vision data to create an ImageMetadataSuggestion object.
 * @param exifData - The EXIF metadata of the image.
 * @param visionData - The vision AI result data of the image.
 * @returns The merged ImageMetadataSuggestion object.
 */
export function mergeMetaAndVisionData(
  exifData: ExifMetadata,
  visionData: VisionAIResult,
): ImageMetadataSuggestion {
  const [date] = exifData.DateTimeOriginal.split(" ");

  if (!date) {
    throw new Error(`Missing original date for ${exifData.SourceFile}.`);
  }

  return {
    id: exifData.FileName.replace(".jpg", ""),
    title: visionData.title_ideas,
    image: `./${exifData.FileName}`,
    alt: visionData.description,
    location: getLocationFromExif(exifData),
    date: date.replaceAll(":", "-"),
    tags: visionData.tags,
    exif: {
      camera: exifData.Model,
      lens: exifData.LensModel,
      aperture: exifData.FNumber.toString(),
      iso: exifData.ISO.toString(),
      focal_length: exifData.FocalLength.replace(" mm", ""),
      shutter_speed: exifData.ExposureTime,
    },
  };
}

/**
 * Writes the given image metadata to a JSON file.
 * @param imageMetadata - The image metadata to be written.
 * @returns A Promise that resolves when the JSON file is written successfully.
 */
async function writeToJsonFile(
  imageMetadata: ImageMetadataSuggestion,
  imagePath: string,
  photosDirectory: string,
): Promise<void> {
  const relativeImagePath = relative(photosDirectory, imagePath);
  const jsonPath = getMetadataPathForImage(relativeImagePath, photosDirectory);
  const json = JSON.stringify(imageMetadata, null, 2);
  await writeFile(jsonPath, json);
}

/**
 * Main.
 */
async function main() {
  consola.start("Checking for images to process...");
  const cliOptions = parseCliOptions(process.argv.slice(2));
  const photosDirectory = cliOptions.photosDirectory ?? PHOTOS_DIR;

  /// Load all images that don't have a JSON file.
  const images = await getImagesToProcess(photosDirectory, cliOptions);

  if (images.length === 0) {
    consola.success(
      cliOptions.refresh
        ? "No images found to refresh."
        : "No images require metadata.",
    );
    return;
  }

  consola.info(
    `Vision settings: concurrency=${cliOptions.visionConcurrency}, retries=${cliOptions.visionMaxRetries}, backoff=${cliOptions.visionBaseBackoffMs}ms`,
  );

  ensureVisionCanRun(images);

  /// Extract EXIF metadata from these images.
  const exifData = await mapWithConcurrency(
    images,
    8,
    async (imagePath, index) => {
      consola.info(`Extracting EXIF ${index + 1}/${images.length}...`);
      return await extractExifMetadata(imagePath);
    },
  );

  /// Determine the image description, title suggestions and tags for each image with AI.
  const visionData = await mapWithConcurrency(
    exifData,
    cliOptions.visionConcurrency,
    async (exifEntry, index) => {
      consola.info(`Generating AI metadata ${index + 1}/${exifData.length}...`);
      return await generateImageDescriptionTitleSuggestionsAndTags(
        exifEntry,
        cliOptions,
      );
    },
  );

  /// Merge the EXIF and Vision data to create the final metadata suggestion.
  const imageData = exifData.map((e, i) => {
    const currentVisionData = visionData[i];

    if (!currentVisionData) {
      throw new Error(`Missing vision data for ${e.SourceFile}.`);
    }

    return mergeMetaAndVisionData(e, currentVisionData);
  });

  /// Write the metadata to JSON files.
  await mapWithConcurrency(imageData, 8, async (imageMetadata, index) => {
    const sourceFile = exifData[index]?.SourceFile;

    if (!sourceFile) {
      throw new Error(`Missing source file for ${imageMetadata.id}.`);
    }

    await writeToJsonFile(imageMetadata, sourceFile, photosDirectory);
    consola.info(`Wrote metadata ${index + 1}/${imageData.length}.`);
  });

  consola.success("All images processed successfully.");
}

if (process.argv[1] && fileURLToPath(import.meta.url) === process.argv[1]) {
  try {
    await main();
  } catch (error) {
    consola.error(error);
    process.exit(1);
  }
}