fix template / imageproc issues

This commit is contained in:
Patrick Devine 2024-09-26 22:39:45 -07:00
parent a2d33ee390
commit 5486c57364
4 changed files with 171 additions and 93 deletions

View File

@ -42,7 +42,7 @@ func min(a, b int) int {
return b
}
func GetImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
@ -62,7 +62,7 @@ func GetImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) im
return image.Point{w, h}
}
func GetOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
possibleTileArrangements := GetSupportedAspectRatios(maxImageTiles)
possibleCanvasSizes := []image.Point{}
for _, pta := range possibleTileArrangements {
@ -104,11 +104,13 @@ func GetOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i
selectedScale = minUpscale
}
selectedCanvas := possibleCanvasSizes[0]
var selectedCanvas image.Point
for n, pcs := range possibleCanvasSizes {
if scales[n] == selectedScale {
// choose the largest possible canvas
if pcs.X*pcs.Y > selectedCanvas.X*selectedCanvas.Y {
// choose the smallest possible canvas
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
selectedCanvas = pcs
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
selectedCanvas = pcs
}
}
@ -116,7 +118,7 @@ func GetOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i
return selectedCanvas
}
func SplitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
b := img.Bounds()
width := b.Max.X - b.Min.X
height := b.Max.Y - b.Min.Y
@ -141,10 +143,9 @@ func ResizeImage(img image.Image, outputSize image.Point, maxImageTiles int) (im
b := img.Bounds()
tileSize := outputSize.Y
canvasSize := GetOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
newSize := GetImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
draw.ApproxBiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
@ -165,7 +166,7 @@ func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image
}
func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
subImages := SplitToTiles(img, aspectRatio)
subImages := splitToTiles(img, aspectRatio)
var pixelVals []float32
@ -218,8 +219,6 @@ func Preprocess(imageData []byte) ([]float32, int, error) {
newImage, aspectRatio := ResizeImage(img, outputSize, maxTiles)
newImage = PadImage(newImage, outputSize, aspectRatio)
// todo: need to scale (dim) by 1/256
data := PackImages(newImage, aspectRatio, mean, std)
supportedRatios := GetSupportedAspectRatios(maxTiles)
var aspectRatioIndex int

View File

@ -1,7 +1,9 @@
package imageproc
import (
"bytes"
"image"
"image/png"
"reflect"
"testing"
)
@ -27,12 +29,12 @@ func testEq(a, b any) bool {
}
func TestAspectRatios(t *testing.T) {
type AspectCase struct {
type aspectCase struct {
MaxTiles int
Expected []image.Point
}
cases := []AspectCase{
cases := []aspectCase{
{
MaxTiles: 1,
Expected: []image.Point{{1, 1}},
@ -61,14 +63,14 @@ func TestAspectRatios(t *testing.T) {
}
func TestGetImageSizeFitToCanvas(t *testing.T) {
type ImageSizeCase struct {
type imageSizeCase struct {
ImageRect image.Point
CanvasRect image.Point
TileSize int
Expected image.Point
}
cases := []ImageSizeCase{
cases := []imageSizeCase{
{
ImageRect: image.Point{400, 400},
CanvasRect: image.Point{640, 480},
@ -108,7 +110,7 @@ func TestGetImageSizeFitToCanvas(t *testing.T) {
}
for _, c := range cases {
actual := GetImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)
actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)
if actual != c.Expected {
t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
@ -117,19 +119,19 @@ func TestGetImageSizeFitToCanvas(t *testing.T) {
}
func TestGetOptimalTiledCanvas(t *testing.T) {
type TiledCanvasSizeCase struct {
type tiledCanvasSizeCase struct {
ImageSize image.Point
MaxImageTiles int
TileSize int
Expected image.Point
}
cases := []TiledCanvasSizeCase{
cases := []tiledCanvasSizeCase{
{
ImageSize: image.Point{1024, 768},
MaxImageTiles: 4,
TileSize: 1000,
Expected: image.Point{4000, 1000},
Expected: image.Point{2000, 1000},
},
{
ImageSize: image.Point{1024, 768},
@ -140,7 +142,7 @@ func TestGetOptimalTiledCanvas(t *testing.T) {
}
for _, c := range cases {
actual := GetOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)
actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)
if actual != c.Expected {
t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
@ -149,13 +151,13 @@ func TestGetOptimalTiledCanvas(t *testing.T) {
}
func TestSplitToTiles(t *testing.T) {
type SplitCase struct {
type splitCase struct {
TestImage image.Image
NumTilesSize image.Point
Expected []image.Image
}
cases := []SplitCase{
cases := []splitCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
NumTilesSize: image.Point{1, 1},
@ -182,7 +184,7 @@ func TestSplitToTiles(t *testing.T) {
}
for _, c := range cases {
actual := SplitToTiles(c.TestImage, c.NumTilesSize)
actual := splitToTiles(c.TestImage, c.NumTilesSize)
if len(actual) != len(c.Expected) {
t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
@ -197,7 +199,7 @@ func TestSplitToTiles(t *testing.T) {
}
func TestResize(t *testing.T) {
type ResizeCase struct {
type resizeCase struct {
TestImage image.Image
OutputSize image.Point
MaxImageTiles int
@ -205,7 +207,7 @@ func TestResize(t *testing.T) {
ExpectedAspectRatio image.Point
}
cases := []ResizeCase{
cases := []resizeCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
OutputSize: image.Point{100, 100},
@ -218,7 +220,14 @@ func TestResize(t *testing.T) {
OutputSize: image.Point{100, 100},
MaxImageTiles: 2,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
ExpectedAspectRatio: image.Point{1, 2},
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
OutputSize: image.Point{560, 560},
MaxImageTiles: 4,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
@ -244,20 +253,20 @@ func TestResize(t *testing.T) {
}
if actualAspectRatio != c.ExpectedAspectRatio {
t.Errorf("canvas size incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
}
}
}
func TestPad(t *testing.T) {
type PadCase struct {
type padCase struct {
TestImage image.Image
OutputSize image.Point
AspectRatio image.Point
Expected image.Image
}
cases := []PadCase{
cases := []padCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)),
OutputSize: image.Point{560, 560},
@ -276,30 +285,79 @@ func TestPad(t *testing.T) {
}
func TestPackImages(t *testing.T) {
type PackCase struct {
TestImage image.Image
AspectRatio image.Point
type packCase struct {
TestImage image.Image
AspectRatio image.Point
ExpectedVals int
}
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
cases := []PackCase{
cases := []packCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
AspectRatio: image.Point{2, 2},
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
AspectRatio: image.Point{2, 2},
ExpectedVals: 2 * 2 * 3 * 560 * 560,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
AspectRatio: image.Point{1, 1},
TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
AspectRatio: image.Point{1, 1},
ExpectedVals: 1 * 1 * 3 * 560 * 560,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)),
AspectRatio: image.Point{1, 2},
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)),
AspectRatio: image.Point{1, 2},
ExpectedVals: 1 * 2 * 3 * 560 * 560,
},
}
for _, c := range cases {
PackImages(c.TestImage, c.AspectRatio, mean, std)
actualVals := PackImages(c.TestImage, c.AspectRatio, mean, std)
if len(actualVals) != c.ExpectedVals {
t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
}
}
}
// TestPreprocess PNG-encodes synthetic RGBA images, passes the encoded bytes
// to Preprocess, and verifies that some pixel data is returned together with
// the expected aspect ratio ID for each input size.
func TestPreprocess(t *testing.T) {
	type preprocessCase struct {
		TestImage             image.Image
		ExpectedVals          int
		ExpectedAspectRatioID int
	}

	// NOTE(review): ExpectedVals is populated for every case but is never
	// compared against the returned data below; only a non-empty result is
	// required. Confirm the intended counts before asserting on it.
	cases := []preprocessCase{
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 10, 10)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 1,
		},
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 6,
		},
	}

	for i := range cases {
		tc := cases[i]

		// Preprocess consumes raw encoded bytes, so serialize the image first.
		encoded := new(bytes.Buffer)
		if encErr := png.Encode(encoded, tc.TestImage); encErr != nil {
			t.Fatal(encErr)
		}

		pixels, gotID, procErr := Preprocess(encoded.Bytes())
		if procErr != nil {
			t.Fatalf("error processing: %q", procErr)
		}

		if len(pixels) < 1 {
			t.Errorf("no image data returned")
		}

		if want := tc.ExpectedAspectRatioID; gotID != want {
			t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", gotID, want)
		}
	}
}

View File

@ -3,6 +3,8 @@ package server
import (
"bytes"
"context"
"image"
"image/png"
"testing"
"github.com/google/go-cmp/cmp"
@ -13,18 +15,41 @@ import (
func TestChatPrompt(t *testing.T) {
type expect struct {
prompt string
images [][]byte
prompt string
images [][]byte
aspectRatioID int
}
tmpl, err := template.Parse(`
{{- if .System }}{{ .System }} {{ end }}
{{- if .Prompt }}{{ .Prompt }} {{ end }}
{{- if .Response }}{{ .Response }} {{ end }}`)
if err != nil {
t.Fatal(err)
}
visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}
img := image.NewRGBA(image.Rect(0, 0, 5, 5))
var buf bytes.Buffer
err = png.Encode(&buf, img)
if err != nil {
t.Fatal(err)
}
imgBuf := buf.Bytes()
cases := []struct {
name string
model Model
limit int
msgs []api.Message
expect
}{
{
name: "messages",
model: visionModel,
limit: 64,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
@ -37,6 +62,7 @@ func TestChatPrompt(t *testing.T) {
},
{
name: "truncate messages",
model: visionModel,
limit: 1,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
@ -49,6 +75,7 @@ func TestChatPrompt(t *testing.T) {
},
{
name: "truncate messages with image",
model: visionModel,
limit: 64,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
@ -64,6 +91,7 @@ func TestChatPrompt(t *testing.T) {
},
{
name: "truncate messages with images",
model: visionModel,
limit: 64,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}},
@ -79,6 +107,7 @@ func TestChatPrompt(t *testing.T) {
},
{
name: "messages with images",
model: visionModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}},
@ -95,6 +124,7 @@ func TestChatPrompt(t *testing.T) {
},
{
name: "message with image tag",
model: visionModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry! [img]", Images: []api.ImageData{[]byte("something")}},
@ -111,6 +141,7 @@ func TestChatPrompt(t *testing.T) {
},
{
name: "messages with interleaved images",
model: visionModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
@ -129,6 +160,7 @@ func TestChatPrompt(t *testing.T) {
},
{
name: "truncate message with interleaved images",
model: visionModel,
limit: 1024,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
@ -146,6 +178,7 @@ func TestChatPrompt(t *testing.T) {
},
{
name: "message with system prompt",
model: visionModel,
limit: 2048,
msgs: []api.Message{
{Role: "system", Content: "You are the Test Who Lived."},
@ -159,6 +192,7 @@ func TestChatPrompt(t *testing.T) {
},
{
name: "out of order system",
model: visionModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
@ -170,19 +204,39 @@ func TestChatPrompt(t *testing.T) {
prompt: "You're a test, Harry! I-I'm a what? You are the Test Who Lived. A test. And a thumping good one at that, I'd wager. ",
},
},
}
tmpl, err := template.Parse(`
{{- if .System }}{{ .System }} {{ end }}
{{- if .Prompt }}{{ .Prompt }} {{ end }}
{{- if .Response }}{{ .Response }} {{ end }}`)
if err != nil {
t.Fatal(err)
{
name: "messages with mllama (no images)",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
},
},
{
name: "messages with mllama",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
model := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
model := tt.model
opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}}
prompt, images, err := chatPrompt(context.TODO(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil)
if err != nil {
@ -202,8 +256,14 @@ func TestChatPrompt(t *testing.T) {
t.Errorf("expected ID %d, got %d", i, images[i].ID)
}
if !bytes.Equal(images[i].Data, tt.images[i]) {
t.Errorf("expected %q, got %q", tt.images[i], images[i].Data)
if len(model.Config.ModelFamilies) == 0 {
if !bytes.Equal(images[i].Data, tt.images[i]) {
t.Errorf("expected %q, got %q", tt.images[i], images[i].Data)
}
} else {
if images[i].AspectRatioID != tt.aspectRatioID {
t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID)
}
}
}
})

View File

@ -317,45 +317,6 @@ What is your name?<|im_end|>
<|im_start|>assistant
`,
},
{
"moondream",
[]template{
// this does not have a "no response" test because it's impossible to render the same output
{"response", `{{ if .Prompt }}Question: {{ .Prompt }}
{{ end }}Answer: {{ .Response }}
`},
{"messages", `
{{- range .Messages }}
{{- if eq .Role "user" }}Question: {{ .Content }}
{{ else if eq .Role "assistant" }}Answer: {{ .Content }}
{{ end }}
{{- end }}Answer: `},
},
Values{
Messages: []api.Message{
{Role: "user", Content: "What's in this image?", Images: []api.ImageData{[]byte("")}},
{Role: "assistant", Content: "It's a hot dog."},
{Role: "user", Content: "What's in _this_ image?"},
{Role: "user", Images: []api.ImageData{[]byte("")}},
{Role: "user", Content: "Is it a hot dog?"},
},
},
`Question: [img-0] What's in this image?
Answer: It's a hot dog.
Question: What's in _this_ image?
[img-1]
Is it a hot dog?
Answer: `,
},
}
for _, tt := range cases {