diff --git a/server/imageproc/images.go b/server/imageproc/images.go index f485bbea..9d766b9c 100644 --- a/server/imageproc/images.go +++ b/server/imageproc/images.go @@ -42,7 +42,7 @@ func min(a, b int) int { return b } -func GetImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { +func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { targetWidth := clip(imageSize.X, tileSize, canvasSize.X) targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y) @@ -62,7 +62,7 @@ func GetImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) im return image.Point{w, h} } -func GetOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { +func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { possibleTileArrangements := GetSupportedAspectRatios(maxImageTiles) possibleCanvasSizes := []image.Point{} for _, pta := range possibleTileArrangements { @@ -104,11 +104,13 @@ func GetOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i selectedScale = minUpscale } - selectedCanvas := possibleCanvasSizes[0] + var selectedCanvas image.Point for n, pcs := range possibleCanvasSizes { if scales[n] == selectedScale { - // choose the largest possible canvas - if pcs.X*pcs.Y > selectedCanvas.X*selectedCanvas.Y { + // choose the smallest possible canvas + if selectedCanvas.X == 0 && selectedCanvas.Y == 0 { + selectedCanvas = pcs + } else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y { selectedCanvas = pcs } } @@ -116,7 +118,7 @@ func GetOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i return selectedCanvas } -func SplitToTiles(img image.Image, numTilesSize image.Point) []image.Image { +func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { b := img.Bounds() width := b.Max.X - b.Min.X height := b.Max.Y - b.Min.Y @@ -141,10 +143,9 @@ func ResizeImage(img image.Image, outputSize image.Point, maxImageTiles int) (im b := img.Bounds() tileSize := outputSize.Y - canvasSize := GetOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) + canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - - newSize := GetImageSizeFitToCanvas(b.Max, canvasSize, tileSize) + newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize) dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) draw.ApproxBiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil) @@ -165,7 +166,7 @@ func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image } func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 { - subImages := SplitToTiles(img, aspectRatio) + subImages := splitToTiles(img, aspectRatio) var pixelVals []float32 @@ -218,8 +219,6 @@ func Preprocess(imageData []byte) ([]float32, int, error) { newImage, aspectRatio := ResizeImage(img, outputSize, maxTiles) newImage = PadImage(newImage, outputSize, aspectRatio) - // todo: need to scale (dim) by 1/256 - data := PackImages(newImage, aspectRatio, mean, std) supportedRatios := GetSupportedAspectRatios(maxTiles) var aspectRatioIndex int diff --git a/server/imageproc/images_test.go b/server/imageproc/images_test.go index ce30cfde..702f7555 100644 --- a/server/imageproc/images_test.go +++ b/server/imageproc/images_test.go @@ -1,7 +1,9 @@ package imageproc import ( + "bytes" "image" + "image/png" "reflect" "testing" ) @@ -27,12 +29,12 @@ func testEq(a, b any) bool { } func TestAspectRatios(t *testing.T) { - type AspectCase struct { + type aspectCase struct { MaxTiles int Expected []image.Point } - cases := []AspectCase{ + cases := []aspectCase{ { MaxTiles: 1, Expected: []image.Point{{1, 1}}, @@ -61,14 +63,14 @@ func TestAspectRatios(t *testing.T) { } func TestGetImageSizeFitToCanvas(t *testing.T) { - type ImageSizeCase struct { + type imageSizeCase struct { ImageRect image.Point CanvasRect image.Point TileSize int Expected image.Point } - cases := []ImageSizeCase{ + cases := []imageSizeCase{ { ImageRect: image.Point{400, 400}, CanvasRect: image.Point{640, 480}, @@ -108,7 +110,7 @@ func TestGetImageSizeFitToCanvas(t *testing.T) { } for _, c := range cases { - actual := GetImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize) + actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize) if actual != c.Expected { t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected) @@ -117,19 +119,19 @@ func TestGetImageSizeFitToCanvas(t *testing.T) { } func TestGetOptimalTiledCanvas(t *testing.T) { - type TiledCanvasSizeCase struct { + type tiledCanvasSizeCase struct { ImageSize image.Point MaxImageTiles int TileSize int Expected image.Point } - cases := []TiledCanvasSizeCase{ + cases := []tiledCanvasSizeCase{ { ImageSize: image.Point{1024, 768}, MaxImageTiles: 4, TileSize: 1000, - Expected: image.Point{4000, 1000}, + Expected: image.Point{2000, 1000}, }, { ImageSize: image.Point{1024, 768}, @@ -140,7 +142,7 @@ func TestGetOptimalTiledCanvas(t *testing.T) { } for _, c := range cases { - actual := GetOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize) + actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize) if actual != c.Expected { t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected) @@ -149,13 +151,13 @@ func TestGetOptimalTiledCanvas(t *testing.T) { } func TestSplitToTiles(t *testing.T) { - type SplitCase struct { + type splitCase struct { TestImage image.Image NumTilesSize image.Point Expected []image.Image } - cases := []SplitCase{ + cases := []splitCase{ { TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), NumTilesSize: image.Point{1, 1}, @@ -182,7 +184,7 @@ func TestSplitToTiles(t *testing.T) { } for _, c := range cases { - actual := SplitToTiles(c.TestImage, c.NumTilesSize) + actual := splitToTiles(c.TestImage, c.NumTilesSize) if len(actual) != len(c.Expected) { t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected)) @@ -197,7 +199,7 @@ func TestSplitToTiles(t *testing.T) { } func TestResize(t *testing.T) { - type ResizeCase struct { + type resizeCase struct { TestImage image.Image OutputSize image.Point MaxImageTiles int @@ -205,7 +207,7 @@ func TestResize(t *testing.T) { ExpectedAspectRatio image.Point } - cases := []ResizeCase{ + cases := []resizeCase{ { TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)), OutputSize: image.Point{100, 100}, @@ -218,7 +220,14 @@ func TestResize(t *testing.T) { OutputSize: image.Point{100, 100}, MaxImageTiles: 2, ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), - ExpectedAspectRatio: image.Point{1, 2}, + ExpectedAspectRatio: image.Point{1, 1}, + }, + { + TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), + OutputSize: image.Point{560, 560}, + MaxImageTiles: 4, + ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), + ExpectedAspectRatio: image.Point{1, 1}, }, { TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)), @@ -244,20 +253,20 @@ func TestResize(t *testing.T) { } if actualAspectRatio != c.ExpectedAspectRatio { - t.Errorf("canvas size incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio) + t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio) } } } func TestPad(t *testing.T) { - type PadCase struct { + type padCase struct { TestImage image.Image OutputSize image.Point AspectRatio image.Point Expected image.Image } - cases := []PadCase{ + cases := []padCase{ { TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)), OutputSize: image.Point{560, 560}, @@ -276,30 +285,79 @@ func TestPad(t *testing.T) { } func TestPackImages(t *testing.T) { - type PackCase struct { - TestImage image.Image - AspectRatio image.Point + type packCase struct { + TestImage image.Image + AspectRatio image.Point + ExpectedVals int } mean := [3]float32{0.48145466, 0.4578275, 0.40821073} std := [3]float32{0.26862954, 0.26130258, 0.27577711} - cases := []PackCase{ + cases := []packCase{ { - TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), - AspectRatio: image.Point{2, 2}, + TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), + AspectRatio: image.Point{2, 2}, + ExpectedVals: 2 * 2 * 3 * 560 * 560, }, { - TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), - AspectRatio: image.Point{1, 1}, + TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), + AspectRatio: image.Point{1, 1}, + ExpectedVals: 1 * 1 * 3 * 560 * 560, }, { - TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)), - AspectRatio: image.Point{1, 2}, + TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)), + AspectRatio: image.Point{1, 2}, + ExpectedVals: 1 * 2 * 3 * 560 * 560, }, } for _, c := range cases { - PackImages(c.TestImage, c.AspectRatio, mean, std) + actualVals := PackImages(c.TestImage, c.AspectRatio, mean, std) + if len(actualVals) != c.ExpectedVals { + t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals) + } + } +} + +func TestPreprocess(t *testing.T) { + type preprocessCase struct { + TestImage image.Image + ExpectedVals int + ExpectedAspectRatioID int + } + + cases := []preprocessCase{ + { + TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), + ExpectedVals: 0, + ExpectedAspectRatioID: 1, + }, + { + TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), + ExpectedVals: 0, + ExpectedAspectRatioID: 6, + }, + } + + for _, c := range cases { + var buf bytes.Buffer + err := png.Encode(&buf, c.TestImage) + if err != nil { + t.Fatal(err) + } + + imgData, aspectRatioID, err := Preprocess(buf.Bytes()) + if err != nil { + t.Fatalf("error processing: %q", err) + } + + if len(imgData) == 0 { + t.Errorf("no image data returned") + } + + if aspectRatioID != c.ExpectedAspectRatioID { + t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID) + } } } diff --git a/server/prompt_test.go b/server/prompt_test.go index 26a20027..bd70f154 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -3,6 +3,8 @@ package server import ( "bytes" "context" + "image" + "image/png" "testing" "github.com/google/go-cmp/cmp" @@ -13,18 +15,41 @@ import ( func TestChatPrompt(t *testing.T) { type expect struct { - prompt string - images [][]byte + prompt string + images [][]byte + aspectRatioID int } + tmpl, err := template.Parse(` +{{- if .System }}{{ .System }} {{ end }} +{{- if .Prompt }}{{ .Prompt }} {{ end }} +{{- if .Response }}{{ .Response }} {{ end }}`) + if err != nil { + t.Fatal(err) + } + visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}} + mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}} + + img := image.NewRGBA(image.Rect(0, 0, 5, 5)) + var buf bytes.Buffer + + err = png.Encode(&buf, img) + if err != nil { + t.Fatal(err) + } + + imgBuf := buf.Bytes() + cases := []struct { name string + model Model limit int msgs []api.Message expect }{ { name: "messages", + model: visionModel, limit: 64, msgs: []api.Message{ {Role: "user", Content: "You're a test, Harry!"}, @@ -37,6 +62,7 @@ func TestChatPrompt(t *testing.T) { }, { name: "truncate messages", + model: visionModel, limit: 1, msgs: []api.Message{ {Role: "user", Content: "You're a test, Harry!"}, @@ -49,6 +75,7 @@ func TestChatPrompt(t *testing.T) { }, { name: "truncate messages with image", + model: visionModel, limit: 64, msgs: []api.Message{ {Role: "user", Content: "You're a test, Harry!"}, @@ -64,6 +91,7 @@ func TestChatPrompt(t *testing.T) { }, { name: "truncate messages with images", + model: visionModel, limit: 64, msgs: []api.Message{ {Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}}, @@ -79,6 +107,7 @@ func TestChatPrompt(t *testing.T) { }, { name: "messages with images", + model: visionModel, limit: 2048, msgs: []api.Message{ {Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}}, @@ -95,6 +124,7 @@ func TestChatPrompt(t *testing.T) { }, { name: "message with image tag", + model: visionModel, limit: 2048, msgs: []api.Message{ {Role: "user", Content: "You're a test, Harry! [img]", Images: []api.ImageData{[]byte("something")}}, @@ -111,6 +141,7 @@ func TestChatPrompt(t *testing.T) { }, { name: "messages with interleaved images", + model: visionModel, limit: 2048, msgs: []api.Message{ {Role: "user", Content: "You're a test, Harry!"}, @@ -129,6 +160,7 @@ func TestChatPrompt(t *testing.T) { }, { name: "truncate message with interleaved images", + model: visionModel, limit: 1024, msgs: []api.Message{ {Role: "user", Content: "You're a test, Harry!"}, @@ -146,6 +178,7 @@ func TestChatPrompt(t *testing.T) { }, { name: "message with system prompt", + model: visionModel, limit: 2048, msgs: []api.Message{ {Role: "system", Content: "You are the Test Who Lived."}, @@ -159,6 +192,7 @@ func TestChatPrompt(t *testing.T) { }, { name: "out of order system", + model: visionModel, limit: 2048, msgs: []api.Message{ {Role: "user", Content: "You're a test, Harry!"}, @@ -170,19 +204,39 @@ func TestChatPrompt(t *testing.T) { prompt: "You're a test, Harry! I-I'm a what? You are the Test Who Lived. A test. And a thumping good one at that, I'd wager. ", }, }, - } - - tmpl, err := template.Parse(` -{{- if .System }}{{ .System }} {{ end }} -{{- if .Prompt }}{{ .Prompt }} {{ end }} -{{- if .Response }}{{ .Response }} {{ end }}`) - if err != nil { - t.Fatal(err) + { + name: "messages with mllama (no images)", + model: mllamaModel, + limit: 2048, + msgs: []api.Message{ + {Role: "user", Content: "You're a test, Harry!"}, + {Role: "assistant", Content: "I-I'm a what?"}, + {Role: "user", Content: "A test. And a thumping good one at that, I'd wager."}, + }, + expect: expect{ + prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ", + }, + }, + { + name: "messages with mllama", + model: mllamaModel, + limit: 2048, + msgs: []api.Message{ + {Role: "user", Content: "You're a test, Harry!"}, + {Role: "assistant", Content: "I-I'm a what?"}, + {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}}, + }, + expect: expect{ + prompt: "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ", + images: [][]byte{imgBuf}, + aspectRatioID: 1, + }, + }, } for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { - model := Model{Template: tmpl, ProjectorPaths: []string{"vision"}} + model := tt.model opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}} prompt, images, err := chatPrompt(context.TODO(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil) if err != nil { @@ -202,8 +256,14 @@ func TestChatPrompt(t *testing.T) { t.Errorf("expected ID %d, got %d", i, images[i].ID) } - if !bytes.Equal(images[i].Data, tt.images[i]) { - t.Errorf("expected %q, got %q", tt.images[i], images[i].Data) + if len(model.Config.ModelFamilies) == 0 { + if !bytes.Equal(images[i].Data, tt.images[i]) { + t.Errorf("expected %q, got %q", tt.images[i], images[i].Data) + } + } else { + if images[i].AspectRatioID != tt.aspectRatioID { + t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID) + } } } }) diff --git a/template/template_test.go b/template/template_test.go index 113e0683..616bef6a 100644 --- a/template/template_test.go +++ b/template/template_test.go @@ -317,45 +317,6 @@ What is your name?<|im_end|> <|im_start|>assistant `, }, - { - "moondream", - []template{ - // this does not have a "no response" test because it's impossible to render the same output - {"response", `{{ if .Prompt }}Question: {{ .Prompt }} - -{{ end }}Answer: {{ .Response }} - -`}, - {"messages", ` -{{- range .Messages }} -{{- if eq .Role "user" }}Question: {{ .Content }} - -{{ else if eq .Role "assistant" }}Answer: {{ .Content }} - -{{ end }} -{{- end }}Answer: `}, - }, - Values{ - Messages: []api.Message{ - {Role: "user", Content: "What's in this image?", Images: []api.ImageData{[]byte("")}}, - {Role: "assistant", Content: "It's a hot dog."}, - {Role: "user", Content: "What's in _this_ image?"}, - {Role: "user", Images: []api.ImageData{[]byte("")}}, - {Role: "user", Content: "Is it a hot dog?"}, - }, - }, - `Question: [img-0] What's in this image? - -Answer: It's a hot dog. - -Question: What's in _this_ image? - -[img-1] - -Is it a hot dog? - -Answer: `, - }, } for _, tt := range cases {