imageproc mllama refactor

This commit is contained in:
Patrick Devine 2024-10-13 22:30:25 -07:00
parent d7eb05b936
commit 685125ab03
6 changed files with 389 additions and 125 deletions

111
models/imageproc/images.go Normal file
View File

@ -0,0 +1,111 @@
package imageproc
import (
"image"
"image/color"
"golang.org/x/image/draw"
)
// Per-channel mean and standard-deviation presets for Normalize.
// Index 0, 1, 2 of each array is applied to the r, g, b channel respectively.
var (
// ImageNet "default" preset.
// NOTE(review): presumably the ImageNet dataset statistics — confirm against the reference preprocessor.
ImageNetDefaultMean = [3]float32{0.485, 0.456, 0.406}
ImageNetDefaultSTD = [3]float32{0.229, 0.224, 0.225}
// ImageNet "standard" preset: 0.5 for every channel.
ImageNetStandardMean = [3]float32{0.5, 0.5, 0.5}
ImageNetStandardSTD = [3]float32{0.5, 0.5, 0.5}
// CLIP preset.
// NOTE(review): presumably the statistics used by CLIP-style vision encoders — confirm.
ClipDefaultMean = [3]float32{0.48145466, 0.4578275, 0.40821073}
ClipDefaultSTD = [3]float32{0.26862954, 0.26130258, 0.27577711}
)
// Interpolation methods accepted by Resize. Each one selects the draw
// package kernel named in its comment; any other value makes Resize panic.
const (
ResizeBilinear = iota // draw.BiLinear: slow, high quality
ResizeNearestNeighbor // draw.NearestNeighbor: fast, blocky output
ResizeApproxBilinear // draw.ApproxBiLinear: fast, medium quality
ResizeCatmullrom // draw.CatmullRom: very slow, very high quality
)
// Composite flattens img onto an opaque white canvas and returns the result.
//
// The returned image is a new *image.RGBA with the same bounds as img; img
// itself is not modified.
func Composite(img image.Image) image.Image {
	bounds := img.Bounds()
	canvas := image.NewRGBA(bounds)

	// Paint the whole canvas white first (Src replaces whatever is there),
	// then blend the source on top (Over) so transparent areas show white.
	background := image.NewUniform(color.RGBA{R: 255, G: 255, B: 255, A: 255})
	draw.Draw(canvas, canvas.Bounds(), background, image.Point{}, draw.Src)
	draw.Draw(canvas, canvas.Bounds(), img, bounds.Min, draw.Over)

	return canvas
}
// Resize returns a new image holding img scaled to newSize.
//
// method selects the interpolation kernel and must be one of ResizeBilinear,
// ResizeNearestNeighbor, ResizeApproxBilinear or ResizeCatmullrom. Any other
// value panics with "no resizing method found".
//
// The result is a fresh *image.RGBA whose bounds start at (0, 0); img is not
// modified.
func Resize(img image.Image, newSize image.Point, method int) image.Image {
	// A switch avoids building a lookup map on every call.
	var kernel draw.Interpolator
	switch method {
	case ResizeBilinear:
		kernel = draw.BiLinear
	case ResizeNearestNeighbor:
		kernel = draw.NearestNeighbor
	case ResizeApproxBilinear:
		kernel = draw.ApproxBiLinear
	case ResizeCatmullrom:
		kernel = draw.CatmullRom
	default:
		panic("no resizing method found")
	}

	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
	kernel.Scale(dst, dst.Rect, img, img.Bounds(), draw.Over, nil)

	return dst
}
// Normalize converts img into a flat slice of float32 channel values, each
// computed as (value - mean[c]) / std[c] for c = 0 (r), 1 (g), 2 (b). The
// alpha channel is ignored.
//
// Pixels are visited row by row, left to right within each row.
//
//   - rescale: when true, each 8-bit channel value is first divided by 255.
//     NOTE(review): when false the channel value is left at zero, so the
//     output is (0 - mean) / std regardless of pixel data. TestNormalize pins
//     this behaviour, so it is preserved here — confirm whether it is intended.
//   - channelFirst: when true the output is planar (all r values, then all g,
//     then all b); when false it is interleaved (r, g, b per pixel).
//
// The result holds 3 * width * height values, or nil if img has no pixels.
func Normalize(img image.Image, mean, std [3]float32, rescale bool, channelFirst bool) []float32 {
	bounds := img.Bounds()
	if bounds.Empty() {
		// An image with no pixels yields nil rather than an empty slice.
		return nil
	}

	// The output size is known up front, so allocate once instead of growing
	// several slices one append at a time.
	numPixels := bounds.Dx() * bounds.Dy()
	pixelVals := make([]float32, 3*numPixels)

	// Position of pixel i's channels in the output:
	//   interleaved: i*3 + {0, 1, 2}
	//   planar:      i   + {0, numPixels, 2*numPixels}
	rOff, gOff, bOff, stride := 0, 1, 2, 3
	if channelFirst {
		rOff, gOff, bOff, stride = 0, numPixels, 2*numPixels, 1
	}

	i := 0
	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
		for x := bounds.Min.X; x < bounds.Max.X; x++ {
			r, g, b, _ := img.At(x, y).RGBA()

			var rVal, gVal, bVal float32
			if rescale {
				// RGBA() returns 16-bit channels; keep the top 8 bits.
				rVal = float32(r>>8) / 255.0
				gVal = float32(g>>8) / 255.0
				bVal = float32(b>>8) / 255.0
			}

			base := i * stride
			pixelVals[base+rOff] = (rVal - mean[0]) / std[0]
			pixelVals[base+gOff] = (gVal - mean[1]) / std[1]
			pixelVals[base+bOff] = (bVal - mean[2]) / std[2]
			i++
		}
	}

	return pixelVals
}

View File

@ -0,0 +1,177 @@
package imageproc
import (
"image"
"image/color"
"image/draw"
"reflect"
"testing"
)
// createImage returns a width x height RGBA image, anchored at (0, 0), with
// every pixel set to fillCol.
func createImage(width, height int, fillCol color.RGBA) image.Image {
	canvas := image.NewRGBA(image.Rect(0, 0, width, height))
	fill := image.NewUniform(fillCol)

	// Src overwrites every pixel, so the alpha of fillCol is stored as-is.
	draw.Draw(canvas, canvas.Rect, fill, image.Point{}, draw.Src)

	return canvas
}
// TestComposite checks that Composite yields the expected colour at every
// pixel: white where the input is fully transparent, and the input colour
// unchanged where it is opaque.
func TestComposite(t *testing.T) {
	type compositeCase struct {
		name         string
		img          image.Image
		expectedRGBA color.RGBA
	}

	cases := []compositeCase{
		{
			name:         "Transparent image",
			img:          createImage(5, 5, color.RGBA{0, 0, 0, 0}),
			expectedRGBA: color.RGBA{255, 255, 255, 255},
		},
		{
			name:         "Solid red image",
			img:          createImage(5, 5, color.RGBA{255, 0, 0, 255}),
			expectedRGBA: color.RGBA{255, 0, 0, 255},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := Composite(tc.img)
			size := got.Bounds().Size()

			// Every pixel is compared against the case's single expected colour.
			for x := 0; x < size.X; x++ {
				for y := 0; y < size.Y; y++ {
					r, g, b, a := got.At(x, y).RGBA()
					expectedR, expectedG, expectedB, expectedA := tc.expectedRGBA.RGBA()
					if r == expectedR && g == expectedG && b == expectedB && a == expectedA {
						continue
					}
					t.Errorf("Pixel mismatch at (%d, %d): got (%d, %d, %d, %d), want (%d, %d, %d, %d)",
						x, y, r, g, b, a, expectedR, expectedG, expectedB, expectedA)
				}
			}
		})
	}
}
// TestResize checks that Resize returns an image of the requested dimensions
// for each supported interpolation method. Pixel content is not inspected.
func TestResize(t *testing.T) {
	type resizeCase struct {
		name     string
		img      image.Image
		newSize  image.Point
		method   int
		expected image.Point
	}

	cases := []resizeCase{
		{
			name:     "Resize with bilinear interpolation",
			img:      createImage(5, 5, color.RGBA{255, 0, 0, 255}),
			newSize:  image.Pt(10, 10),
			method:   ResizeBilinear,
			expected: image.Pt(10, 10),
		},
		{
			name:     "Resize with nearest neighbor",
			img:      createImage(10, 10, color.RGBA{0, 255, 0, 255}),
			newSize:  image.Pt(5, 5),
			method:   ResizeNearestNeighbor,
			expected: image.Pt(5, 5),
		},
		{
			name:     "Resize with catmullrom",
			img:      createImage(1024, 1024, color.RGBA{0, 0, 255, 255}),
			newSize:  image.Pt(10, 10),
			method:   ResizeCatmullrom,
			expected: image.Pt(10, 10),
		},
		{
			name:     "Resize with approx bilinear",
			img:      createImage(1024, 768, color.RGBA{100, 100, 100, 255}),
			newSize:  image.Pt(4, 3),
			method:   ResizeApproxBilinear,
			expected: image.Pt(4, 3),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := Resize(tc.img, tc.newSize, tc.method).Bounds().Size()
			if got == tc.expected {
				return
			}
			t.Errorf("Unexpected size for resized image: got (%d, %d), want (%d, %d)",
				got.X, got.Y, tc.expected.X, tc.expected.Y)
		})
	}
}
// TestResizeInvalidMethod checks that Resize panics when given a method value
// that is not one of the Resize* constants.
func TestResizeInvalidMethod(t *testing.T) {
	const bogusMethod = -1

	defer func() {
		if recover() != nil {
			return // panicked as expected
		}
		t.Errorf("Expected panic for invalid resizing method, but did not panic")
	}()

	src := createImage(10, 10, color.RGBA{A: 255})
	Resize(src, image.Pt(5, 5), bogusMethod)
}
// TestNormalize checks Normalize's output for 2x2 solid-colour images across
// the rescale and channelFirst options. Results are compared exactly.
func TestNormalize(t *testing.T) {
	// repeat returns vals concatenated n times.
	repeat := func(vals []float32, n int) []float32 {
		out := make([]float32, 0, len(vals)*n)
		for i := 0; i < n; i++ {
			out = append(out, vals...)
		}
		return out
	}

	type normalizeCase struct {
		name         string
		img          image.Image
		mean         [3]float32
		std          [3]float32
		rescale      bool
		channelFirst bool
		expected     []float32
	}

	cases := []normalizeCase{
		{
			name:         "Rescale with channel first",
			img:          createImage(2, 2, color.RGBA{128, 128, 128, 255}),
			mean:         ImageNetStandardMean,
			std:          ImageNetStandardSTD,
			rescale:      true,
			channelFirst: true,
			// Four R values, then four G, then four B: all identical for grey.
			expected: repeat([]float32{0.003921628}, 12),
		},
		{
			name:         "Rescale without channel first",
			img:          createImage(2, 2, color.RGBA{255, 0, 0, 255}),
			mean:         [3]float32{0.0, 0.0, 0.0},
			std:          [3]float32{1.0, 1.0, 1.0},
			rescale:      true,
			channelFirst: false,
			// One (r, g, b) triple per pixel.
			expected: repeat([]float32{1.0, 0.0, 0.0}, 4),
		},
		{
			name:         "No rescale with mean/std adjustment",
			img:          createImage(2, 2, color.RGBA{100, 150, 200, 255}),
			mean:         ClipDefaultMean,
			std:          ClipDefaultSTD,
			rescale:      false,
			channelFirst: false,
			// Without rescale Normalize leaves each channel at zero, so every
			// triple is (0 - mean) / std.
			expected: repeat([]float32{-1.7922626, -1.7520971, -1.4802198}, 4),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := Normalize(tc.img, tc.mean, tc.std, tc.rescale, tc.channelFirst)
			if reflect.DeepEqual(got, tc.expected) {
				return
			}
			t.Errorf("Test %s failed: got %v, want %v", tc.name, got, tc.expected)
		})
	}
}

View File

@ -1,19 +1,20 @@
package imageproc
package mllama
import (
"bytes"
"fmt"
"image"
"image/color"
_ "image/jpeg"
_ "image/png"
"io"
"math"
"slices"
"golang.org/x/image/draw"
"github.com/ollama/ollama/models/imageproc"
)
func GetSupportedAspectRatios(maxTiles int) []image.Point {
func getSupportedAspectRatios(maxTiles int) []image.Point {
ratios := []image.Point{}
for w := range maxTiles {
@ -37,28 +38,8 @@ func clip(a, a_min, a_max int) int {
return a
}
// getImageSizeFitToCanvas returns the size to scale an image to so that it
// fits the canvas: both sides are scaled by the smaller of the two per-axis
// ratios (rounded down) and each side is capped at its clipped target.
//
// NOTE(review): clip is defined elsewhere in this file; it presumably clamps
// its first argument into [tileSize, canvas side] — confirm.
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	maxW := clip(imageSize.X, tileSize, canvasSize.X)
	maxH := clip(imageSize.Y, tileSize, canvasSize.Y)

	ratioW := float64(maxW) / float64(imageSize.X)
	ratioH := float64(maxH) / float64(imageSize.Y)

	// The smaller ratio governs: that side takes its target exactly and the
	// other side is scaled by the same ratio.
	if ratioW < ratioH {
		scaledH := int(math.Floor(float64(imageSize.Y) * ratioW))
		return image.Point{X: maxW, Y: min(scaledH, maxH)}
	}

	scaledW := int(math.Floor(float64(imageSize.X) * ratioH))
	return image.Point{X: min(scaledW, maxW), Y: maxH}
}
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
possibleTileArrangements := GetSupportedAspectRatios(maxImageTiles)
possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
possibleCanvasSizes := []image.Point{}
for _, pta := range possibleTileArrangements {
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
@ -113,6 +94,53 @@ func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i
return selectedCanvas
}
// getImageSizeFitToCanvas returns the size to scale an image to so that it
// fits the canvas: both sides are scaled by the smaller of the two per-axis
// ratios (rounded down) and each side is capped at its clipped target.
//
// NOTE(review): clip is defined elsewhere in this file; it presumably clamps
// its first argument into [tileSize, canvas side] — confirm.
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	maxW := clip(imageSize.X, tileSize, canvasSize.X)
	maxH := clip(imageSize.Y, tileSize, canvasSize.Y)

	ratioW := float64(maxW) / float64(imageSize.X)
	ratioH := float64(maxH) / float64(imageSize.Y)

	// The smaller ratio governs: that side takes its target exactly and the
	// other side is scaled by the same ratio.
	if ratioW < ratioH {
		scaledH := int(math.Floor(float64(imageSize.Y) * ratioW))
		return image.Point{X: maxW, Y: min(scaledH, maxH)}
	}

	scaledW := int(math.Floor(float64(imageSize.X) * ratioH))
	return image.Point{X: min(scaledW, maxW), Y: maxH}
}
// resizeImage scales img to fit the tiled canvas chosen for its size and
// returns the scaled image together with the canvas's tile arrangement
// (tiles across, tiles down).
//
// PNG input is first composited onto a white background to drop its alpha
// channel. outputSize.Y is used as the tile size and maxImageTiles bounds the
// canvases considered by getOptimalTiledCanvas.
func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
	if format == "png" {
		img = imageproc.Composite(img)
	}

	// Use the image's width and height rather than Bounds().Max: the two
	// only coincide when the bounds start at (0, 0).
	size := img.Bounds().Size()

	tileSize := outputSize.Y
	canvasSize := getOptimalTiledCanvas(size, maxImageTiles, tileSize)
	aspectRatio := image.Point{X: canvasSize.X / tileSize, Y: canvasSize.Y / tileSize}
	newSize := getImageSizeFitToCanvas(size, canvasSize, tileSize)

	return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
}
// padImage returns a new RGBA image of size outputSize * aspectRatio
// (component-wise) with img drawn into its top-left corner. The area not
// covered by img is left as the zero value of the new image (transparent).
func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
	}
	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))

	// Read from the source's own origin and write to dst's top-left corner so
	// that images whose bounds do not start at (0, 0) are copied correctly.
	src := img.Bounds()
	draw.Draw(dst, image.Rectangle{Max: src.Size()}, img, src.Min, draw.Over)

	return dst
}
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
b := img.Bounds()
width := b.Max.X - b.Min.X
@ -134,107 +162,40 @@ func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
return images
}
// compositeImage removes the alpha channel by drawing img over a canvas that
// has been prefilled with opaque white. The result has the same bounds as img.
func compositeImage(img image.Image) image.Image {
	bounds := img.Bounds()
	canvas := image.NewRGBA(bounds)

	// Src replaces the canvas with white; Over then blends img on top.
	background := image.NewUniform(color.RGBA{R: 255, G: 255, B: 255, A: 255})
	draw.Draw(canvas, canvas.Bounds(), background, image.Point{}, draw.Src)
	draw.Draw(canvas, canvas.Bounds(), img, bounds.Min, draw.Over)

	return canvas
}
// ResizeImage scales img to fit the tiled canvas chosen for its size and
// returns the scaled image together with the canvas's tile arrangement
// (tiles across, tiles down).
//
// PNG input is first composited onto a white background to drop its alpha
// channel. outputSize.Y is used as the tile size and maxImageTiles bounds the
// canvases considered by getOptimalTiledCanvas.
func ResizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
	if format == "png" {
		img = compositeImage(img)
	}

	b := img.Bounds()
	// Use the image's width and height rather than b.Max: the two only
	// coincide when the bounds start at (0, 0).
	size := b.Size()

	tileSize := outputSize.Y
	canvasSize := getOptimalTiledCanvas(size, maxImageTiles, tileSize)
	aspectRatio := image.Point{X: canvasSize.X / tileSize, Y: canvasSize.Y / tileSize}
	newSize := getImageSizeFitToCanvas(size, canvasSize, tileSize)

	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))

	// scaling choices:
	//   NearestNeighbor	fast, blocky output
	//   ApproxBiLinear	fast, medium quality
	//   BiLinear		slow, high quality
	//   CatmullRom		very slow, very high quality
	draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)

	return dst, aspectRatio
}
// PadImage returns a new RGBA image of size outputSize * aspectRatio
// (component-wise) with img drawn into its top-left corner. The area not
// covered by img is left as the zero value of the new image (transparent).
func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
	}
	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))

	// Read from the source's own origin and write to dst's top-left corner so
	// that images whose bounds do not start at (0, 0) are copied correctly.
	src := img.Bounds()
	draw.Draw(dst, image.Rectangle{Max: src.Size()}, img, src.Min, draw.Over)

	return dst
}
func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
func packImages(img image.Image, aspectRatio image.Point) []float32 {
subImages := splitToTiles(img, aspectRatio)
var pixelVals []float32
rescale := true
channelFirst := true
for _, subImg := range subImages {
bounds := subImg.Bounds()
var rVals, gVals, bVals []float32
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
for x := bounds.Min.X; x < bounds.Max.X; x++ {
c := subImg.At(x, y)
r, g, b, _ := c.RGBA()
rVal := float32(r>>8) / 255.0
gVal := float32(g>>8) / 255.0
bVal := float32(b>>8) / 255.0
rVal = (rVal - mean[0]) / std[0]
gVal = (gVal - mean[1]) / std[1]
bVal = (bVal - mean[2]) / std[2]
rVals = append(rVals, rVal)
gVals = append(gVals, gVal)
bVals = append(bVals, bVal)
}
}
pixelVals = append(pixelVals, rVals...)
pixelVals = append(pixelVals, gVals...)
pixelVals = append(pixelVals, bVals...)
vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
pixelVals = append(pixelVals, vals...)
}
return pixelVals
}
func Preprocess(imageData []byte) ([]float32, int, error) {
// todo: need guard in here for bad image data
// mllama values
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
outputSize := image.Point{560, 560}
maxTiles := 4
// clip values
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
img, format, err := image.Decode(bytes.NewReader(imageData))
img, format, err := image.Decode(imageData)
if err != nil {
return nil, 0, fmt.Errorf("failed to decode image: %w", err)
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
}
newImage, aspectRatio := ResizeImage(img, format, outputSize, maxTiles)
newImage = PadImage(newImage, outputSize, aspectRatio)
newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
newImage = padImage(newImage, outputSize, aspectRatio)
data := PackImages(newImage, aspectRatio, mean, std)
aspectRatioIndex := slices.Index(GetSupportedAspectRatios(maxTiles), aspectRatio) + 1
data := packImages(newImage, aspectRatio)
aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1
return data, aspectRatioIndex, nil
opts := map[string]any{
"aspectRatioIndex": aspectRatioIndex,
}
return data, opts, nil
}

View File

@ -1,4 +1,4 @@
package imageproc
package mllama
import (
"bytes"
@ -35,7 +35,7 @@ func TestAspectRatios(t *testing.T) {
}
for _, c := range cases {
actual := GetSupportedAspectRatios(c.MaxTiles)
actual := getSupportedAspectRatios(c.MaxTiles)
if diff := cmp.Diff(actual, c.Expected); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
@ -299,7 +299,7 @@ func TestResize(t *testing.T) {
}
for _, c := range cases {
actualImage, actualAspectRatio := ResizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
if actualImage.Bounds() != c.ExpectedImage.Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
@ -329,7 +329,7 @@ func TestPad(t *testing.T) {
}
for _, c := range cases {
actual := PadImage(c.TestImage, c.OutputSize, c.AspectRatio)
actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)
if actual.Bounds() != c.Expected.Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
@ -344,9 +344,6 @@ func TestPackImages(t *testing.T) {
ExpectedVals int
}
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
cases := []packCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
@ -366,7 +363,7 @@ func TestPackImages(t *testing.T) {
}
for _, c := range cases {
actualVals := PackImages(c.TestImage, c.AspectRatio, mean, std)
actualVals := packImages(c.TestImage, c.AspectRatio)
if len(actualVals) != c.ExpectedVals {
t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
}
@ -400,7 +397,7 @@ func TestPreprocess(t *testing.T) {
t.Fatal(err)
}
imgData, aspectRatioID, err := Preprocess(buf.Bytes())
imgData, opts, err := Preprocess(&buf)
if err != nil {
t.Fatalf("error processing: %q", err)
}
@ -409,6 +406,13 @@ func TestPreprocess(t *testing.T) {
t.Errorf("no image data returned")
}
ar, ok := opts["aspectRatioIndex"]
if !ok {
t.Fatalf("no aspect ratio found")
}
aspectRatioID := ar.(int)
if aspectRatioID != c.ExpectedAspectRatioID {
t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
}

View File

@ -11,7 +11,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/server/imageproc"
"github.com/ollama/ollama/models/mllama"
"github.com/ollama/ollama/template"
)
@ -92,7 +92,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
var imgData llm.ImageData
if isMllama {
data, aspectRatioID, err := imageproc.Preprocess(i)
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
if err != nil {
return "", nil, err
}
@ -103,10 +103,15 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
return "", nil, err
}
ar, ok := opts["aspectRatioIndex"].(int)
if !ok {
return "", nil, fmt.Errorf("missing aspect ratio for image")
}
imgData = llm.ImageData{
ID: len(images),
Data: buf.Bytes(),
AspectRatioID: aspectRatioID,
AspectRatioID: ar,
}
imgPrompt = "<|image|>"
} else {

View File

@ -31,10 +31,10 @@ import (
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/models/mllama"
"github.com/ollama/ollama/openai"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/runners"
"github.com/ollama/ollama/server/imageproc"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
@ -192,12 +192,18 @@ func (s *Server) GenerateHandler(c *gin.Context) {
images := make([]llm.ImageData, len(req.Images))
for i := range req.Images {
if isMllama {
data, aspectRatioID, err := imageproc.Preprocess(req.Images[i])
data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
return
}
ar, ok := opts["aspectRatioIndex"].(int)
if !ok {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
return
}
buf := new(bytes.Buffer)
err = binary.Write(buf, binary.LittleEndian, data)
if err != nil {
@ -205,7 +211,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID}
images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
} else {
images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
}