fix template / imageproc issues

2024-09-26 22:39:45 -07:00 · 2024-09-26 22:39:45 -07:00 · 5486c57364
commit 5486c57364
parent a2d33ee390
4 changed files with 171 additions and 93 deletions
--- a/server/imageproc/images.go
+++ b/server/imageproc/images.go
@ -42,7 +42,7 @@ func min(a, b int) int {
 	return b
 }

-func GetImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
+func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
 	targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
 	targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)

@ -62,7 +62,7 @@ func GetImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) im
 	return image.Point{w, h}
 }

-func GetOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
+func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
 	possibleTileArrangements := GetSupportedAspectRatios(maxImageTiles)
 	possibleCanvasSizes := []image.Point{}
 	for _, pta := range possibleTileArrangements {
@ -104,11 +104,13 @@ func GetOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i
 		selectedScale = minUpscale
 	}

-	selectedCanvas := possibleCanvasSizes[0]
+	var selectedCanvas image.Point
 	for n, pcs := range possibleCanvasSizes {
 		if scales[n] == selectedScale {
-			// choose the largest possible canvas
-			if pcs.X*pcs.Y > selectedCanvas.X*selectedCanvas.Y {
+			// choose the smallest possible canvas
+			if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
+				selectedCanvas = pcs
+			} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
 				selectedCanvas = pcs
 			}
 		}
@ -116,7 +118,7 @@ func GetOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i
 	return selectedCanvas
 }

-func SplitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
+func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
 	b := img.Bounds()
 	width := b.Max.X - b.Min.X
 	height := b.Max.Y - b.Min.Y
@ -141,10 +143,9 @@ func ResizeImage(img image.Image, outputSize image.Point, maxImageTiles int) (im
 	b := img.Bounds()
 	tileSize := outputSize.Y

-	canvasSize := GetOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
+	canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
 	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
-
-	newSize := GetImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
+	newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)

 	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
 	draw.ApproxBiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
@ -165,7 +166,7 @@ func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image
 }

 func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
-	subImages := SplitToTiles(img, aspectRatio)
+	subImages := splitToTiles(img, aspectRatio)

 	var pixelVals []float32

@ -218,8 +219,6 @@ func Preprocess(imageData []byte) ([]float32, int, error) {
 	newImage, aspectRatio := ResizeImage(img, outputSize, maxTiles)
 	newImage = PadImage(newImage, outputSize, aspectRatio)

-	// todo: need to scale (dim) by 1/256
-
 	data := PackImages(newImage, aspectRatio, mean, std)
 	supportedRatios := GetSupportedAspectRatios(maxTiles)
 	var aspectRatioIndex int
--- a/server/imageproc/images_test.go
+++ b/server/imageproc/images_test.go
@ -1,7 +1,9 @@
 package imageproc

 import (
+	"bytes"
 	"image"
+	"image/png"
 	"reflect"
 	"testing"
 )
@ -27,12 +29,12 @@ func testEq(a, b any) bool {
 }

 func TestAspectRatios(t *testing.T) {
-	type AspectCase struct {
+	type aspectCase struct {
 		MaxTiles int
 		Expected []image.Point
 	}

-	cases := []AspectCase{
+	cases := []aspectCase{
 		{
 			MaxTiles: 1,
 			Expected: []image.Point{{1, 1}},
@ -61,14 +63,14 @@ func TestAspectRatios(t *testing.T) {
 }

 func TestGetImageSizeFitToCanvas(t *testing.T) {
-	type ImageSizeCase struct {
+	type imageSizeCase struct {
 		ImageRect  image.Point
 		CanvasRect image.Point
 		TileSize   int
 		Expected   image.Point
 	}

-	cases := []ImageSizeCase{
+	cases := []imageSizeCase{
 		{
 			ImageRect:  image.Point{400, 400},
 			CanvasRect: image.Point{640, 480},
@ -108,7 +110,7 @@ func TestGetImageSizeFitToCanvas(t *testing.T) {
 	}

 	for _, c := range cases {
-		actual := GetImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)
+		actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)

 		if actual != c.Expected {
 			t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
@ -117,19 +119,19 @@ func TestGetImageSizeFitToCanvas(t *testing.T) {
 }

 func TestGetOptimalTiledCanvas(t *testing.T) {
-	type TiledCanvasSizeCase struct {
+	type tiledCanvasSizeCase struct {
 		ImageSize     image.Point
 		MaxImageTiles int
 		TileSize      int
 		Expected      image.Point
 	}

-	cases := []TiledCanvasSizeCase{
+	cases := []tiledCanvasSizeCase{
 		{
 			ImageSize:     image.Point{1024, 768},
 			MaxImageTiles: 4,
 			TileSize:      1000,
-			Expected:      image.Point{4000, 1000},
+			Expected:      image.Point{2000, 1000},
 		},
 		{
 			ImageSize:     image.Point{1024, 768},
@ -140,7 +142,7 @@ func TestGetOptimalTiledCanvas(t *testing.T) {
 	}

 	for _, c := range cases {
-		actual := GetOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)
+		actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)

 		if actual != c.Expected {
 			t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
@ -149,13 +151,13 @@ func TestGetOptimalTiledCanvas(t *testing.T) {
 }

 func TestSplitToTiles(t *testing.T) {
-	type SplitCase struct {
+	type splitCase struct {
 		TestImage    image.Image
 		NumTilesSize image.Point
 		Expected     []image.Image
 	}

-	cases := []SplitCase{
+	cases := []splitCase{
 		{
 			TestImage:    image.NewRGBA(image.Rect(0, 0, 1024, 768)),
 			NumTilesSize: image.Point{1, 1},
@ -182,7 +184,7 @@ func TestSplitToTiles(t *testing.T) {
 	}

 	for _, c := range cases {
-		actual := SplitToTiles(c.TestImage, c.NumTilesSize)
+		actual := splitToTiles(c.TestImage, c.NumTilesSize)

 		if len(actual) != len(c.Expected) {
 			t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
@ -197,7 +199,7 @@ func TestSplitToTiles(t *testing.T) {
 }

 func TestResize(t *testing.T) {
-	type ResizeCase struct {
+	type resizeCase struct {
 		TestImage           image.Image
 		OutputSize          image.Point
 		MaxImageTiles       int
@ -205,7 +207,7 @@ func TestResize(t *testing.T) {
 		ExpectedAspectRatio image.Point
 	}

-	cases := []ResizeCase{
+	cases := []resizeCase{
 		{
 			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
 			OutputSize:          image.Point{100, 100},
@ -218,7 +220,14 @@ func TestResize(t *testing.T) {
 			OutputSize:          image.Point{100, 100},
 			MaxImageTiles:       2,
 			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
-			ExpectedAspectRatio: image.Point{1, 2},
+			ExpectedAspectRatio: image.Point{1, 1},
+		},
+		{
+			TestImage:           image.NewRGBA(image.Rect(0, 0, 10, 10)),
+			OutputSize:          image.Point{560, 560},
+			MaxImageTiles:       4,
+			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
+			ExpectedAspectRatio: image.Point{1, 1},
 		},
 		{
 			TestImage:           image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
@ -244,20 +253,20 @@ func TestResize(t *testing.T) {
 		}

 		if actualAspectRatio != c.ExpectedAspectRatio {
-			t.Errorf("canvas size incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
+			t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
 		}
 	}
 }

 func TestPad(t *testing.T) {
-	type PadCase struct {
+	type padCase struct {
 		TestImage   image.Image
 		OutputSize  image.Point
 		AspectRatio image.Point
 		Expected    image.Image
 	}

-	cases := []PadCase{
+	cases := []padCase{
 		{
 			TestImage:   image.NewRGBA(image.Rect(0, 0, 1000, 667)),
 			OutputSize:  image.Point{560, 560},
@ -276,30 +285,79 @@ func TestPad(t *testing.T) {
 }

 func TestPackImages(t *testing.T) {
-	type PackCase struct {
-		TestImage   image.Image
-		AspectRatio image.Point
+	type packCase struct {
+		TestImage    image.Image
+		AspectRatio  image.Point
+		ExpectedVals int
 	}

 	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
 	std := [3]float32{0.26862954, 0.26130258, 0.27577711}

-	cases := []PackCase{
+	cases := []packCase{
 		{
-			TestImage:   image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
-			AspectRatio: image.Point{2, 2},
+			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
+			AspectRatio:  image.Point{2, 2},
+			ExpectedVals: 2 * 2 * 3 * 560 * 560,
 		},
 		{
-			TestImage:   image.NewRGBA(image.Rect(0, 0, 560, 560)),
-			AspectRatio: image.Point{1, 1},
+			TestImage:    image.NewRGBA(image.Rect(0, 0, 560, 560)),
+			AspectRatio:  image.Point{1, 1},
+			ExpectedVals: 1 * 1 * 3 * 560 * 560,
 		},
 		{
-			TestImage:   image.NewRGBA(image.Rect(0, 0, 1120, 560)),
-			AspectRatio: image.Point{1, 2},
+			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 560)),
+			AspectRatio:  image.Point{1, 2},
+			ExpectedVals: 1 * 2 * 3 * 560 * 560,
 		},
 	}

 	for _, c := range cases {
-		PackImages(c.TestImage, c.AspectRatio, mean, std)
+		actualVals := PackImages(c.TestImage, c.AspectRatio, mean, std)
+		if len(actualVals) != c.ExpectedVals {
+			t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
+		}
+	}
+}
+
+func TestPreprocess(t *testing.T) {
+	type preprocessCase struct {
+		TestImage             image.Image
+		ExpectedVals          int
+		ExpectedAspectRatioID int
+	}
+
+	cases := []preprocessCase{
+		{
+			TestImage:             image.NewRGBA(image.Rect(0, 0, 10, 10)),
+			ExpectedVals:          0,
+			ExpectedAspectRatioID: 1,
+		},
+		{
+			TestImage:             image.NewRGBA(image.Rect(0, 0, 1024, 768)),
+			ExpectedVals:          0,
+			ExpectedAspectRatioID: 6,
+		},
+	}
+
+	for _, c := range cases {
+		var buf bytes.Buffer
+		err := png.Encode(&buf, c.TestImage)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		imgData, aspectRatioID, err := Preprocess(buf.Bytes())
+		if err != nil {
+			t.Fatalf("error processing: %q", err)
+		}
+
+		if len(imgData) == 0 {
+			t.Errorf("no image data returned")
+		}
+
+		if aspectRatioID != c.ExpectedAspectRatioID {
+			t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
+		}
 	}
 }
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@ -3,6 +3,8 @@ package server
 import (
 	"bytes"
 	"context"
+	"image"
+	"image/png"
 	"testing"

 	"github.com/google/go-cmp/cmp"
@ -13,18 +15,41 @@ import (

 func TestChatPrompt(t *testing.T) {
 	type expect struct {
-		prompt string
-		images [][]byte
+		prompt        string
+		images        [][]byte
+		aspectRatioID int
 	}

+	tmpl, err := template.Parse(`
+{{- if .System }}{{ .System }} {{ end }}
+{{- if .Prompt }}{{ .Prompt }} {{ end }}
+{{- if .Response }}{{ .Response }} {{ end }}`)
+	if err != nil {
+		t.Fatal(err)
+	}
+	visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
+	mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}
+
+	img := image.NewRGBA(image.Rect(0, 0, 5, 5))
+	var buf bytes.Buffer
+
+	err = png.Encode(&buf, img)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	imgBuf := buf.Bytes()
+
 	cases := []struct {
 		name  string
+		model Model
 		limit int
 		msgs  []api.Message
 		expect
 	}{
 		{
 			name:  "messages",
+			model: visionModel,
 			limit: 64,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!"},
@ -37,6 +62,7 @@ func TestChatPrompt(t *testing.T) {
 		},
 		{
 			name:  "truncate messages",
+			model: visionModel,
 			limit: 1,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!"},
@ -49,6 +75,7 @@ func TestChatPrompt(t *testing.T) {
 		},
 		{
 			name:  "truncate messages with image",
+			model: visionModel,
 			limit: 64,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!"},
@ -64,6 +91,7 @@ func TestChatPrompt(t *testing.T) {
 		},
 		{
 			name:  "truncate messages with images",
+			model: visionModel,
 			limit: 64,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}},
@ -79,6 +107,7 @@ func TestChatPrompt(t *testing.T) {
 		},
 		{
 			name:  "messages with images",
+			model: visionModel,
 			limit: 2048,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}},
@ -95,6 +124,7 @@ func TestChatPrompt(t *testing.T) {
 		},
 		{
 			name:  "message with image tag",
+			model: visionModel,
 			limit: 2048,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry! [img]", Images: []api.ImageData{[]byte("something")}},
@ -111,6 +141,7 @@ func TestChatPrompt(t *testing.T) {
 		},
 		{
 			name:  "messages with interleaved images",
+			model: visionModel,
 			limit: 2048,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!"},
@ -129,6 +160,7 @@ func TestChatPrompt(t *testing.T) {
 		},
 		{
 			name:  "truncate message with interleaved images",
+			model: visionModel,
 			limit: 1024,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!"},
@ -146,6 +178,7 @@ func TestChatPrompt(t *testing.T) {
 		},
 		{
 			name:  "message with system prompt",
+			model: visionModel,
 			limit: 2048,
 			msgs: []api.Message{
 				{Role: "system", Content: "You are the Test Who Lived."},
@ -159,6 +192,7 @@ func TestChatPrompt(t *testing.T) {
 		},
 		{
 			name:  "out of order system",
+			model: visionModel,
 			limit: 2048,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!"},
@ -170,19 +204,39 @@ func TestChatPrompt(t *testing.T) {
 				prompt: "You're a test, Harry! I-I'm a what? You are the Test Who Lived. A test. And a thumping good one at that, I'd wager. ",
 			},
 		},
-	}
-
-	tmpl, err := template.Parse(`
-{{- if .System }}{{ .System }} {{ end }}
-{{- if .Prompt }}{{ .Prompt }} {{ end }}
-{{- if .Response }}{{ .Response }} {{ end }}`)
-	if err != nil {
-		t.Fatal(err)
+		{
+			name:  "messages with mllama (no images)",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
+			},
+			expect: expect{
+				prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
+			},
+		},
+		{
+			name:  "messages with mllama",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
+			},
+			expect: expect{
+				prompt:        "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
+				images:        [][]byte{imgBuf},
+				aspectRatioID: 1,
+			},
+		},
 	}

 	for _, tt := range cases {
 		t.Run(tt.name, func(t *testing.T) {
-			model := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
+			model := tt.model
 			opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}}
 			prompt, images, err := chatPrompt(context.TODO(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil)
 			if err != nil {
@ -202,8 +256,14 @@ func TestChatPrompt(t *testing.T) {
 					t.Errorf("expected ID %d, got %d", i, images[i].ID)
 				}

-				if !bytes.Equal(images[i].Data, tt.images[i]) {
-					t.Errorf("expected %q, got %q", tt.images[i], images[i].Data)
+				if len(model.Config.ModelFamilies) == 0 {
+					if !bytes.Equal(images[i].Data, tt.images[i]) {
+						t.Errorf("expected %q, got %q", tt.images[i], images[i].Data)
+					}
+				} else {
+					if images[i].AspectRatioID != tt.aspectRatioID {
+						t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID)
+					}
 				}
 			}
 		})
--- a/template/template_test.go
+++ b/template/template_test.go
@ -317,45 +317,6 @@ What is your name?<|im_end|>
 <|im_start|>assistant
 `,
 		},
-		{
-			"moondream",
-			[]template{
-				// this does not have a "no response" test because it's impossible to render the same output
-				{"response", `{{ if .Prompt }}Question: {{ .Prompt }}
-
-{{ end }}Answer: {{ .Response }}
-
-`},
-				{"messages", `
-{{- range .Messages }}
-{{- if eq .Role "user" }}Question: {{ .Content }}
-
-{{ else if eq .Role "assistant" }}Answer: {{ .Content }}
-
-{{ end }}
-{{- end }}Answer: `},
-			},
-			Values{
-				Messages: []api.Message{
-					{Role: "user", Content: "What's in this image?", Images: []api.ImageData{[]byte("")}},
-					{Role: "assistant", Content: "It's a hot dog."},
-					{Role: "user", Content: "What's in _this_ image?"},
-					{Role: "user", Images: []api.ImageData{[]byte("")}},
-					{Role: "user", Content: "Is it a hot dog?"},
-				},
-			},
-			`Question: [img-0] What's in this image?
-
-Answer: It's a hot dog.
-
-Question: What's in _this_ image?
-
-[img-1]
-
-Is it a hot dog?
-
-Answer: `,
-		},
 	}

 	for _, tt := range cases {