diff --git a/models/imageproc/images.go b/models/imageproc/images.go new file mode 100644 index 00000000..7afe3670 --- /dev/null +++ b/models/imageproc/images.go @@ -0,0 +1,111 @@ +package imageproc + +import ( + "image" + "image/color" + + "golang.org/x/image/draw" +) + +var ( + ImageNetDefaultMean = [3]float32{0.485, 0.456, 0.406} + ImageNetDefaultSTD = [3]float32{0.229, 0.224, 0.225} + ImageNetStandardMean = [3]float32{0.5, 0.5, 0.5} + ImageNetStandardSTD = [3]float32{0.5, 0.5, 0.5} + ClipDefaultMean = [3]float32{0.48145466, 0.4578275, 0.40821073} + ClipDefaultSTD = [3]float32{0.26862954, 0.26130258, 0.27577711} +) + +const ( + ResizeBilinear = iota + ResizeNearestNeighbor + ResizeApproxBilinear + ResizeCatmullrom +) + +// Composite returns an image with the alpha channel removed by drawing over a white background. +func Composite(img image.Image) image.Image { + dst := image.NewRGBA(img.Bounds()) + + white := color.RGBA{255, 255, 255, 255} + draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src) + draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over) + + return dst +} + +// Resize returns an image which has been scaled to a new size. +func Resize(img image.Image, newSize image.Point, method int) image.Image { + dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) + + kernels := map[int]draw.Interpolator{ + ResizeBilinear: draw.BiLinear, + ResizeNearestNeighbor: draw.NearestNeighbor, + ResizeApproxBilinear: draw.ApproxBiLinear, + ResizeCatmullrom: draw.CatmullRom, + } + + kernel, ok := kernels[method] + if !ok { + panic("no resizing method found") + } + + kernel.Scale(dst, dst.Rect, img, img.Bounds(), draw.Over, nil) + + return dst +} + +// Normalize returns a slice of float32 containing each of the r, g, b values for an image normalized around a value. +func Normalize(img image.Image, mean, std [3]float32, rescale bool, channelFirst bool) []float32 { + var pixelVals []float32 + + bounds := img.Bounds() + if channelFirst { + var rVals, gVals, bVals []float32 + for y := bounds.Min.Y; y < bounds.Max.Y; y++ { + for x := bounds.Min.X; x < bounds.Max.X; x++ { + c := img.At(x, y) + r, g, b, _ := c.RGBA() + var rVal, gVal, bVal float32 + if rescale { + rVal = float32(r>>8) / 255.0 + gVal = float32(g>>8) / 255.0 + bVal = float32(b>>8) / 255.0 + } + + rVal = (rVal - mean[0]) / std[0] + gVal = (gVal - mean[1]) / std[1] + bVal = (bVal - mean[2]) / std[2] + + rVals = append(rVals, rVal) + gVals = append(gVals, gVal) + bVals = append(bVals, bVal) + } + } + + pixelVals = append(pixelVals, rVals...) + pixelVals = append(pixelVals, gVals...) + pixelVals = append(pixelVals, bVals...) + } else { + for y := bounds.Min.Y; y < bounds.Max.Y; y++ { + for x := bounds.Min.X; x < bounds.Max.X; x++ { + c := img.At(x, y) + r, g, b, _ := c.RGBA() + var rVal, gVal, bVal float32 + if rescale { + rVal = float32(r>>8) / 255.0 + gVal = float32(g>>8) / 255.0 + bVal = float32(b>>8) / 255.0 + } + + rVal = (rVal - mean[0]) / std[0] + gVal = (gVal - mean[1]) / std[1] + bVal = (bVal - mean[2]) / std[2] + + pixelVals = append(pixelVals, rVal, gVal, bVal) + } + } + } + + return pixelVals +} diff --git a/models/imageproc/images_test.go b/models/imageproc/images_test.go new file mode 100644 index 00000000..a2e9ed94 --- /dev/null +++ b/models/imageproc/images_test.go @@ -0,0 +1,177 @@ +package imageproc + +import ( + "image" + "image/color" + "image/draw" + "reflect" + "testing" +) + +func createImage(width, height int, fillCol color.RGBA) image.Image { + img := image.NewRGBA(image.Rect(0, 0, width, height)) + draw.Draw(img, img.Bounds(), &image.Uniform{fillCol}, image.Point{}, draw.Src) + return img +} + +func TestComposite(t *testing.T) { + tests := []struct { + name string + img image.Image + expectedRGBA color.RGBA + }{ + { + name: "Transparent image", + img: createImage(5, 5, color.RGBA{0, 0, 0, 0}), + expectedRGBA: color.RGBA{255, 255, 255, 255}, + }, + { + name: "Solid red image", + img: createImage(5, 5, color.RGBA{255, 0, 0, 255}), + expectedRGBA: color.RGBA{255, 0, 0, 255}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + resultImg := Composite(tt.img) + + // Check the pixel values in the resulting image + for x := range resultImg.Bounds().Dx() { + for y := range resultImg.Bounds().Dy() { + r, g, b, a := resultImg.At(x, y).RGBA() + expectedR, expectedG, expectedB, expectedA := tt.expectedRGBA.RGBA() + + if r != expectedR || g != expectedG || b != expectedB || a != expectedA { + t.Errorf("Pixel mismatch at (%d, %d): got (%d, %d, %d, %d), want (%d, %d, %d, %d)", + x, y, r, g, b, a, expectedR, expectedG, expectedB, expectedA) + } + } + } + }) + } +} + +func TestResize(t *testing.T) { + tests := []struct { + name string + img image.Image + newSize image.Point + method int + expected image.Point + }{ + { + name: "Resize with bilinear interpolation", + img: createImage(5, 5, color.RGBA{255, 0, 0, 255}), + newSize: image.Point{10, 10}, + method: ResizeBilinear, + expected: image.Point{10, 10}, + }, + { + name: "Resize with nearest neighbor", + img: createImage(10, 10, color.RGBA{0, 255, 0, 255}), + newSize: image.Point{5, 5}, + method: ResizeNearestNeighbor, + expected: image.Point{5, 5}, + }, + { + name: "Resize with catmullrom", + img: createImage(1024, 1024, color.RGBA{0, 0, 255, 255}), + newSize: image.Point{10, 10}, + method: ResizeCatmullrom, + expected: image.Point{10, 10}, + }, + { + name: "Resize with approx bilinear", + img: createImage(1024, 768, color.RGBA{100, 100, 100, 255}), + newSize: image.Point{4, 3}, + method: ResizeApproxBilinear, + expected: image.Point{4, 3}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + resizedImg := Resize(tt.img, tt.newSize, tt.method) + + if resizedImg.Bounds().Dx() != tt.expected.X || resizedImg.Bounds().Dy() != tt.expected.Y { + t.Errorf("Unexpected size for resized image: got (%d, %d), want (%d, %d)", + resizedImg.Bounds().Dx(), resizedImg.Bounds().Dy(), tt.expected.X, tt.expected.Y) + } + }) + } +} + +func TestResizeInvalidMethod(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("Expected panic for invalid resizing method, but did not panic") + } + }() + + img := createImage(10, 10, color.RGBA{0, 0, 0, 255}) + Resize(img, image.Point{5, 5}, -1) +} + +func TestNormalize(t *testing.T) { + tests := []struct { + name string + img image.Image + mean [3]float32 + std [3]float32 + rescale bool + channelFirst bool + expected []float32 + }{ + { + name: "Rescale with channel first", + img: createImage(2, 2, color.RGBA{128, 128, 128, 255}), + mean: ImageNetStandardMean, + std: ImageNetStandardSTD, + rescale: true, + channelFirst: true, + expected: []float32{ + 0.003921628, 0.003921628, 0.003921628, 0.003921628, // R values + 0.003921628, 0.003921628, 0.003921628, 0.003921628, // G values + 0.003921628, 0.003921628, 0.003921628, 0.003921628, // B values + }, + }, + { + name: "Rescale without channel first", + img: createImage(2, 2, color.RGBA{255, 0, 0, 255}), + mean: [3]float32{0.0, 0.0, 0.0}, + std: [3]float32{1.0, 1.0, 1.0}, + rescale: true, + channelFirst: false, + expected: []float32{ + 1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + }, + }, + { + name: "No rescale with mean/std adjustment", + img: createImage(2, 2, color.RGBA{100, 150, 200, 255}), + mean: ClipDefaultMean, + std: ClipDefaultSTD, + rescale: false, + channelFirst: false, + expected: []float32{ + -1.7922626, -1.7520971, -1.4802198, + -1.7922626, -1.7520971, -1.4802198, + -1.7922626, -1.7520971, -1.4802198, + -1.7922626, -1.7520971, -1.4802198, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := Normalize(tt.img, tt.mean, tt.std, tt.rescale, tt.channelFirst) + + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("Test %s failed: got %v, want %v", tt.name, result, tt.expected) + } + }) + } +} diff --git a/server/imageproc/images.go b/models/mllama/imageproc.go similarity index 60% rename from server/imageproc/images.go rename to models/mllama/imageproc.go index 688cbf8a..d7a5ad8d 100644 --- a/server/imageproc/images.go +++ b/models/mllama/imageproc.go @@ -1,19 +1,20 @@ -package imageproc +package mllama import ( - "bytes" "fmt" "image" - "image/color" _ "image/jpeg" _ "image/png" + "io" "math" "slices" "golang.org/x/image/draw" + + "github.com/ollama/ollama/models/imageproc" ) -func GetSupportedAspectRatios(maxTiles int) []image.Point { +func getSupportedAspectRatios(maxTiles int) []image.Point { ratios := []image.Point{} for w := range maxTiles { @@ -37,28 +38,8 @@ func clip(a, a_min, a_max int) int { return a } -func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { - targetWidth := clip(imageSize.X, tileSize, canvasSize.X) - targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y) - - scaleWidth := float64(targetWidth) / float64(imageSize.X) - scaleHeight := float64(targetHeight) / float64(imageSize.Y) - - var w, h int - - if scaleWidth < scaleHeight { - w = targetWidth - h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) - } else { - w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) - h = targetHeight - } - - return image.Point{w, h} -} - func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { - possibleTileArrangements := GetSupportedAspectRatios(maxImageTiles) + possibleTileArrangements := getSupportedAspectRatios(maxImageTiles) possibleCanvasSizes := []image.Point{} for _, pta := range possibleTileArrangements { possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) @@ -113,6 +94,53 @@ func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i return selectedCanvas } +func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { + targetWidth := clip(imageSize.X, tileSize, canvasSize.X) + targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y) + + scaleWidth := float64(targetWidth) / float64(imageSize.X) + scaleHeight := float64(targetHeight) / float64(imageSize.Y) + + var w, h int + + if scaleWidth < scaleHeight { + w = targetWidth + h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) + } else { + w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) + h = targetHeight + } + + return image.Point{w, h} +} + +func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { + if format == "png" { + img = imageproc.Composite(img) + } + + b := img.Bounds() + tileSize := outputSize.Y + + canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) + aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} + newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize) + + return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio +} + +func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image { + paddedSize := image.Point{ + X: outputSize.X * aspectRatio.X, + Y: outputSize.Y * aspectRatio.Y, + } + + dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) + draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over) + + return dst +} + func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { b := img.Bounds() width := b.Max.X - b.Min.X @@ -134,107 +162,40 @@ func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { return images } -// remove the "alpha" channel by drawing over a prefilled image -func compositeImage(img image.Image) image.Image { - dst := image.NewRGBA(img.Bounds()) - - white := color.RGBA{255, 255, 255, 255} - draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src) - draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over) - - return dst -} - -func ResizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { - if format == "png" { - img = compositeImage(img) - } - - b := img.Bounds() - tileSize := outputSize.Y - - canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) - aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize) - - dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) - - // scaling choices: - // NearestNeighbor fast, blocky output - // ApproxBiLinear fast, medium quality - // BiLinear slow, high quality - // CatmullRom very slow, very high quality - draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil) - - return dst, aspectRatio -} - -func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image { - paddedSize := image.Point{ - X: outputSize.X * aspectRatio.X, - Y: outputSize.Y * aspectRatio.Y, - } - - dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) - draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over) - - return dst -} - -func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 { +func packImages(img image.Image, aspectRatio image.Point) []float32 { subImages := splitToTiles(img, aspectRatio) var pixelVals []float32 + rescale := true + channelFirst := true + for _, subImg := range subImages { - bounds := subImg.Bounds() - var rVals, gVals, bVals []float32 - for y := bounds.Min.Y; y < bounds.Max.Y; y++ { - for x := bounds.Min.X; x < bounds.Max.X; x++ { - c := subImg.At(x, y) - r, g, b, _ := c.RGBA() - rVal := float32(r>>8) / 255.0 - gVal := float32(g>>8) / 255.0 - bVal := float32(b>>8) / 255.0 - - rVal = (rVal - mean[0]) / std[0] - gVal = (gVal - mean[1]) / std[1] - bVal = (bVal - mean[2]) / std[2] - - rVals = append(rVals, rVal) - gVals = append(gVals, gVal) - bVals = append(bVals, bVal) - } - } - pixelVals = append(pixelVals, rVals...) - pixelVals = append(pixelVals, gVals...) - pixelVals = append(pixelVals, bVals...) + vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst) + pixelVals = append(pixelVals, vals...) } return pixelVals } -func Preprocess(imageData []byte) ([]float32, int, error) { - // todo: need guard in here for bad image data - - // mllama values +func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { outputSize := image.Point{560, 560} maxTiles := 4 - // clip values - mean := [3]float32{0.48145466, 0.4578275, 0.40821073} - std := [3]float32{0.26862954, 0.26130258, 0.27577711} - - img, format, err := image.Decode(bytes.NewReader(imageData)) + img, format, err := image.Decode(imageData) if err != nil { - return nil, 0, fmt.Errorf("failed to decode image: %w", err) + return nil, nil, fmt.Errorf("failed to decode image: %w", err) } - newImage, aspectRatio := ResizeImage(img, format, outputSize, maxTiles) - newImage = PadImage(newImage, outputSize, aspectRatio) + newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles) + newImage = padImage(newImage, outputSize, aspectRatio) - data := PackImages(newImage, aspectRatio, mean, std) - aspectRatioIndex := slices.Index(GetSupportedAspectRatios(maxTiles), aspectRatio) + 1 + data := packImages(newImage, aspectRatio) + aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1 - return data, aspectRatioIndex, nil + opts := map[string]any{ + "aspectRatioIndex": aspectRatioIndex, + } + + return data, opts, nil } diff --git a/server/imageproc/images_test.go b/models/mllama/imageproc_test.go similarity index 95% rename from server/imageproc/images_test.go rename to models/mllama/imageproc_test.go index 7ad5329b..a14b91bd 100644 --- a/server/imageproc/images_test.go +++ b/models/mllama/imageproc_test.go @@ -1,4 +1,4 @@ -package imageproc +package mllama import ( "bytes" @@ -35,7 +35,7 @@ func TestAspectRatios(t *testing.T) { } for _, c := range cases { - actual := GetSupportedAspectRatios(c.MaxTiles) + actual := getSupportedAspectRatios(c.MaxTiles) if diff := cmp.Diff(actual, c.Expected); diff != "" { t.Errorf("mismatch (-got +want):\n%s", diff) @@ -299,7 +299,7 @@ func TestResize(t *testing.T) { } for _, c := range cases { - actualImage, actualAspectRatio := ResizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles) + actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles) if actualImage.Bounds() != c.ExpectedImage.Bounds() { t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds()) @@ -329,7 +329,7 @@ func TestPad(t *testing.T) { } for _, c := range cases { - actual := PadImage(c.TestImage, c.OutputSize, c.AspectRatio) + actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio) if actual.Bounds() != c.Expected.Bounds() { t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds()) @@ -344,9 +344,6 @@ func TestPackImages(t *testing.T) { ExpectedVals int } - mean := [3]float32{0.48145466, 0.4578275, 0.40821073} - std := [3]float32{0.26862954, 0.26130258, 0.27577711} - cases := []packCase{ { TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), @@ -366,7 +363,7 @@ func TestPackImages(t *testing.T) { } for _, c := range cases { - actualVals := PackImages(c.TestImage, c.AspectRatio, mean, std) + actualVals := packImages(c.TestImage, c.AspectRatio) if len(actualVals) != c.ExpectedVals { t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals) } @@ -400,7 +397,7 @@ func TestPreprocess(t *testing.T) { t.Fatal(err) } - imgData, aspectRatioID, err := Preprocess(buf.Bytes()) + imgData, opts, err := Preprocess(&buf) if err != nil { t.Fatalf("error processing: %q", err) } @@ -409,6 +406,13 @@ func TestPreprocess(t *testing.T) { t.Errorf("no image data returned") } + ar, ok := opts["aspectRatioIndex"] + if !ok { + t.Fatalf("no aspect ratio found") + } + + aspectRatioID := ar.(int) + if aspectRatioID != c.ExpectedAspectRatioID { t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID) } diff --git a/server/prompt.go b/server/prompt.go index a6401983..ff48b43d 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -11,7 +11,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/llm" - "github.com/ollama/ollama/server/imageproc" + "github.com/ollama/ollama/models/mllama" "github.com/ollama/ollama/template" ) @@ -92,7 +92,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. var imgData llm.ImageData if isMllama { - data, aspectRatioID, err := imageproc.Preprocess(i) + data, opts, err := mllama.Preprocess(bytes.NewReader(i)) if err != nil { return "", nil, err } @@ -103,10 +103,15 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. return "", nil, err } + ar, ok := opts["aspectRatioIndex"].(int) + if !ok { + return "", nil, fmt.Errorf("missing aspect ratio for image") + } + imgData = llm.ImageData{ ID: len(images), Data: buf.Bytes(), - AspectRatioID: aspectRatioID, + AspectRatioID: ar, } imgPrompt = "<|image|>" } else { diff --git a/server/routes.go b/server/routes.go index c5fd3293..ccd86265 100644 --- a/server/routes.go +++ b/server/routes.go @@ -31,10 +31,10 @@ import ( "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/models/mllama" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/runners" - "github.com/ollama/ollama/server/imageproc" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" @@ -192,12 +192,18 @@ func (s *Server) GenerateHandler(c *gin.Context) { images := make([]llm.ImageData, len(req.Images)) for i := range req.Images { if isMllama { - data, aspectRatioID, err := imageproc.Preprocess(req.Images[i]) + data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i])) if err != nil { c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) return } + ar, ok := opts["aspectRatioIndex"].(int) + if !ok { + c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) + return + } + buf := new(bytes.Buffer) err = binary.Write(buf, binary.LittleEndian, data) if err != nil { @@ -205,7 +211,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } - images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID} + images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar} } else { images[i] = llm.ImageData{ID: i, Data: req.Images[i]} }