Enforce 5MB per-image limit when converting images for language models (#45313)
## Problem

When users paste or drag large images into the agent panel, the encoded payload can exceed upstream provider limits (e.g., Anthropic's 5MB per-image limit), causing API errors.

## Solution

Enforce a default 5MB limit on encoded PNG bytes in `LanguageModelImage::from_image`:

1. Apply existing Anthropic dimension limits first (1568px max in either dimension)
2. Iteratively downscale by ~15% per pass until the encoded PNG is at most 5MB
3. Return `None` if the image can't be shrunk within 8 passes (fail-safe)

The limit is enforced at the `LanguageModelImage` conversion layer, which is the choke point for all image ingestion paths (agent panel paste/drag, file mentions, text threads, etc.).

## Future Work

The 5MB limit is a conservative default. Provider-specific limits can be introduced later by adding a `from_image_with_constraints` API.

## Testing

Added a regression test that:

1. Generates a noisy 4096x4096 PNG (guaranteed >5MB)
2. Converts it via `LanguageModelImage::from_image`
3. Asserts the result is ≤5MB and was actually downscaled

---

**Note:** This PR builds on #45312 (prompt store fail-open fix). Please merge that first.

cc @rtfeldman

---------

Co-authored-by: Zed Zippy <234243425+zed-zippy[bot]@users.noreply.github.com>
This commit is contained in:
@@ -8,6 +8,7 @@ use gpui::{
|
||||
App, AppContext as _, DevicePixels, Image, ImageFormat, ObjectFit, SharedString, Size, Task,
|
||||
point, px, size,
|
||||
};
|
||||
use image::GenericImageView as _;
|
||||
use image::codecs::png::PngEncoder;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use util::ResultExt;
|
||||
@@ -80,6 +81,16 @@ impl std::fmt::Debug for LanguageModelImage {
|
||||
/// Anthropic wants uploaded images to be smaller than this in both dimensions.
|
||||
const ANTHROPIC_SIZE_LIMIT: f32 = 1568.;
|
||||
|
||||
/// Default per-image hard limit (in bytes) for the encoded image payload we send upstream.
|
||||
///
|
||||
/// NOTE: `LanguageModelImage.source` is base64-encoded PNG bytes (without the `data:` prefix).
|
||||
/// This limit is enforced on the encoded PNG bytes *before* base64 encoding.
|
||||
const DEFAULT_IMAGE_MAX_BYTES: usize = 5 * 1024 * 1024;
|
||||
|
||||
/// Conservative cap on how many times we'll attempt to shrink/re-encode an image to fit
|
||||
/// `DEFAULT_IMAGE_MAX_BYTES`.
|
||||
const MAX_IMAGE_DOWNSCALE_PASSES: usize = 8;
|
||||
|
||||
impl LanguageModelImage {
|
||||
pub fn empty() -> Self {
|
||||
Self {
|
||||
@@ -112,29 +123,62 @@ impl LanguageModelImage {
|
||||
let height = dynamic_image.height();
|
||||
let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));
|
||||
|
||||
let base64_image = {
|
||||
if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
|
||||
|| image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
|
||||
{
|
||||
let new_bounds = ObjectFit::ScaleDown.get_bounds(
|
||||
gpui::Bounds {
|
||||
origin: point(px(0.0), px(0.0)),
|
||||
size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
|
||||
},
|
||||
image_size,
|
||||
);
|
||||
let resized_image = dynamic_image.resize(
|
||||
new_bounds.size.width.into(),
|
||||
new_bounds.size.height.into(),
|
||||
image::imageops::FilterType::Triangle,
|
||||
);
|
||||
// First apply any provider-specific dimension constraints we know about (Anthropic).
|
||||
let mut processed_image = if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
|
||||
|| image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
|
||||
{
|
||||
let new_bounds = ObjectFit::ScaleDown.get_bounds(
|
||||
gpui::Bounds {
|
||||
origin: point(px(0.0), px(0.0)),
|
||||
size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
|
||||
},
|
||||
image_size,
|
||||
);
|
||||
dynamic_image.resize(
|
||||
new_bounds.size.width.into(),
|
||||
new_bounds.size.height.into(),
|
||||
image::imageops::FilterType::Triangle,
|
||||
)
|
||||
} else {
|
||||
dynamic_image
|
||||
};
|
||||
|
||||
encode_as_base64(data, resized_image)
|
||||
} else {
|
||||
encode_as_base64(data, dynamic_image)
|
||||
// Then enforce a default per-image size cap on the encoded PNG bytes.
|
||||
//
|
||||
// We always send PNG bytes (either original PNG bytes, or re-encoded PNG) base64'd.
|
||||
// The upstream provider limit we want to respect is effectively on the binary image
|
||||
// payload size, so we enforce against the encoded PNG bytes before base64 encoding.
|
||||
let mut encoded_png = encode_png_bytes(&processed_image).log_err()?;
|
||||
for _pass in 0..MAX_IMAGE_DOWNSCALE_PASSES {
|
||||
if encoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES {
|
||||
break;
|
||||
}
|
||||
|
||||
// Scale down geometrically to converge quickly. We don't know the final PNG size
|
||||
// as a function of pixels, so we iteratively shrink.
|
||||
let (w, h) = processed_image.dimensions();
|
||||
if w <= 1 || h <= 1 {
|
||||
break;
|
||||
}
|
||||
|
||||
// Shrink by ~15% each pass (0.85). This is a compromise between speed and
|
||||
// preserving image detail.
|
||||
let new_w = ((w as f32) * 0.85).round().max(1.0) as u32;
|
||||
let new_h = ((h as f32) * 0.85).round().max(1.0) as u32;
|
||||
|
||||
processed_image =
|
||||
processed_image.resize(new_w, new_h, image::imageops::FilterType::Triangle);
|
||||
encoded_png = encode_png_bytes(&processed_image).log_err()?;
|
||||
}
|
||||
.log_err()?;
|
||||
|
||||
if encoded_png.len() > DEFAULT_IMAGE_MAX_BYTES {
|
||||
// Still too large after multiple passes; treat as non-convertible for now.
|
||||
// (Provider-specific handling can be introduced later.)
|
||||
return None;
|
||||
}
|
||||
|
||||
// Now base64 encode the PNG bytes.
|
||||
let base64_image = encode_bytes_as_base64(encoded_png.as_slice()).log_err()?;
|
||||
|
||||
// SAFETY: The base64 encoder should not produce non-UTF8.
|
||||
let source = unsafe { String::from_utf8_unchecked(base64_image) };
|
||||
@@ -164,21 +208,20 @@ impl LanguageModelImage {
|
||||
}
|
||||
}
|
||||
|
||||
fn encode_as_base64(data: Arc<Image>, image: image::DynamicImage) -> Result<Vec<u8>> {
|
||||
fn encode_png_bytes(image: &image::DynamicImage) -> Result<Vec<u8>> {
|
||||
let mut png = Vec::new();
|
||||
image.write_with_encoder(PngEncoder::new(&mut png))?;
|
||||
Ok(png)
|
||||
}
|
||||
|
||||
fn encode_bytes_as_base64(bytes: &[u8]) -> Result<Vec<u8>> {
|
||||
let mut base64_image = Vec::new();
|
||||
{
|
||||
let mut base64_encoder = EncoderWriter::new(
|
||||
Cursor::new(&mut base64_image),
|
||||
&base64::engine::general_purpose::STANDARD,
|
||||
);
|
||||
if data.format() == ImageFormat::Png {
|
||||
base64_encoder.write_all(data.bytes())?;
|
||||
} else {
|
||||
let mut png = Vec::new();
|
||||
image.write_with_encoder(PngEncoder::new(&mut png))?;
|
||||
|
||||
base64_encoder.write_all(png.as_slice())?;
|
||||
}
|
||||
base64_encoder.write_all(bytes)?;
|
||||
}
|
||||
Ok(base64_image)
|
||||
}
|
||||
@@ -417,6 +460,71 @@ pub struct LanguageModelResponseMessage {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use base64::Engine as _;
|
||||
use gpui::TestAppContext;
|
||||
use image::ImageDecoder as _;
|
||||
|
||||
fn base64_to_png_bytes(base64_png: &str) -> Vec<u8> {
|
||||
base64::engine::general_purpose::STANDARD
|
||||
.decode(base64_png.as_bytes())
|
||||
.expect("base64 should decode")
|
||||
}
|
||||
|
||||
fn png_dimensions(png_bytes: &[u8]) -> (u32, u32) {
|
||||
let decoder =
|
||||
image::codecs::png::PngDecoder::new(Cursor::new(png_bytes)).expect("png should decode");
|
||||
decoder.dimensions()
|
||||
}
|
||||
|
||||
fn make_noisy_png_bytes(width: u32, height: u32) -> Vec<u8> {
|
||||
// Create an RGBA image with per-pixel variance to avoid PNG compressing too well.
|
||||
let mut img = image::RgbaImage::new(width, height);
|
||||
for y in 0..height {
|
||||
for x in 0..width {
|
||||
let r = ((x ^ y) & 0xFF) as u8;
|
||||
let g = ((x.wrapping_mul(31) ^ y.wrapping_mul(17)) & 0xFF) as u8;
|
||||
let b = ((x.wrapping_mul(131) ^ y.wrapping_mul(7)) & 0xFF) as u8;
|
||||
img.put_pixel(x, y, image::Rgba([r, g, b, 0xFF]));
|
||||
}
|
||||
}
|
||||
|
||||
let mut out = Vec::new();
|
||||
image::DynamicImage::ImageRgba8(img)
|
||||
.write_with_encoder(PngEncoder::new(&mut out))
|
||||
.expect("png encoding should succeed");
|
||||
out
|
||||
}
|
||||
|
||||
#[gpui::test]
|
||||
async fn test_from_image_downscales_to_default_5mb_limit(cx: &mut TestAppContext) {
|
||||
// Pick a size that reliably produces a PNG > 5MB when filled with noise.
|
||||
// If this fails (image is too small), bump dimensions.
|
||||
let original_png = make_noisy_png_bytes(4096, 4096);
|
||||
assert!(
|
||||
original_png.len() > DEFAULT_IMAGE_MAX_BYTES,
|
||||
"precondition failed: noisy PNG must exceed DEFAULT_IMAGE_MAX_BYTES"
|
||||
);
|
||||
|
||||
let image = gpui::Image::from_bytes(ImageFormat::Png, original_png);
|
||||
let lm_image = cx
|
||||
.update(|cx| LanguageModelImage::from_image(Arc::new(image), cx))
|
||||
.await
|
||||
.expect("image conversion should succeed");
|
||||
|
||||
let encoded_png = base64_to_png_bytes(lm_image.source.as_ref());
|
||||
assert!(
|
||||
encoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES,
|
||||
"expected encoded PNG <= DEFAULT_IMAGE_MAX_BYTES, got {} bytes",
|
||||
encoded_png.len()
|
||||
);
|
||||
|
||||
// Ensure we actually downscaled in pixels (not just re-encoded).
|
||||
let (w, h) = png_dimensions(&encoded_png);
|
||||
assert!(
|
||||
w < 4096 || h < 4096,
|
||||
"expected image to be downscaled in at least one dimension; got {w}x{h}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_language_model_tool_result_content_deserialization() {
|
||||
|
||||
Reference in New Issue
Block a user