imageproc mllama refactor
This commit is contained in:
parent
d7eb05b936
commit
685125ab03
111
models/imageproc/images.go
Normal file
111
models/imageproc/images.go
Normal file
@ -0,0 +1,111 @@
|
||||
package imageproc
|
||||
|
||||
import (
|
||||
"image"
|
||||
"image/color"
|
||||
|
||||
"golang.org/x/image/draw"
|
||||
)
|
||||
|
||||
var (
|
||||
ImageNetDefaultMean = [3]float32{0.485, 0.456, 0.406}
|
||||
ImageNetDefaultSTD = [3]float32{0.229, 0.224, 0.225}
|
||||
ImageNetStandardMean = [3]float32{0.5, 0.5, 0.5}
|
||||
ImageNetStandardSTD = [3]float32{0.5, 0.5, 0.5}
|
||||
ClipDefaultMean = [3]float32{0.48145466, 0.4578275, 0.40821073}
|
||||
ClipDefaultSTD = [3]float32{0.26862954, 0.26130258, 0.27577711}
|
||||
)
|
||||
|
||||
const (
|
||||
ResizeBilinear = iota
|
||||
ResizeNearestNeighbor
|
||||
ResizeApproxBilinear
|
||||
ResizeCatmullrom
|
||||
)
|
||||
|
||||
// Composite flattens img onto an opaque white canvas of the same bounds,
// so that any transparency in the source is replaced by white. The result is
// always a newly allocated RGBA image; img itself is not modified.
func Composite(img image.Image) image.Image {
	bounds := img.Bounds()
	canvas := image.NewRGBA(bounds)

	// Paint the whole canvas white first, then blend the source on top of it.
	background := image.NewUniform(color.RGBA{R: 255, G: 255, B: 255, A: 255})
	draw.Draw(canvas, bounds, background, image.Point{}, draw.Src)
	draw.Draw(canvas, bounds, img, bounds.Min, draw.Over)

	return canvas
}
|
||||
|
||||
// Resize returns an image which has been scaled to a new size.
|
||||
func Resize(img image.Image, newSize image.Point, method int) image.Image {
|
||||
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
|
||||
|
||||
kernels := map[int]draw.Interpolator{
|
||||
ResizeBilinear: draw.BiLinear,
|
||||
ResizeNearestNeighbor: draw.NearestNeighbor,
|
||||
ResizeApproxBilinear: draw.ApproxBiLinear,
|
||||
ResizeCatmullrom: draw.CatmullRom,
|
||||
}
|
||||
|
||||
kernel, ok := kernels[method]
|
||||
if !ok {
|
||||
panic("no resizing method found")
|
||||
}
|
||||
|
||||
kernel.Scale(dst, dst.Rect, img, img.Bounds(), draw.Over, nil)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
// Normalize converts img into a flat slice of float32 channel values, applying
// (v - mean[c]) / std[c] to the red, green and blue channels. The alpha
// channel is dropped.
//
// Pixels are visited row by row. When channelFirst is true the output is
// planar: all red values, then all green values, then all blue values.
// Otherwise the output is interleaved as r, g, b for each pixel in turn.
//
// When rescale is true each channel is reduced to 8 bits and divided by 255
// before normalization.
//
// NOTE(review): when rescale is false the pixel data never reaches the
// result; every channel starts from 0, so each output value is -mean[c]/std[c].
// The package tests pin this behaviour, but it looks unintentional. Confirm
// the intended semantics before relying on rescale=false.
//
// An image with empty bounds yields a nil slice.
func Normalize(img image.Image, mean, std [3]float32, rescale bool, channelFirst bool) []float32 {
	bounds := img.Bounds()
	width, height := bounds.Dx(), bounds.Dy()
	if width <= 0 || height <= 0 {
		return nil
	}

	// The output length is known up front, so allocate once instead of
	// growing per-channel slices with append.
	n := width * height
	out := make([]float32, 3*n)

	i := 0 // index of the current pixel in row-major order
	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
		for x := bounds.Min.X; x < bounds.Max.X; x++ {
			r, g, b, _ := img.At(x, y).RGBA()

			var rVal, gVal, bVal float32
			if rescale {
				// RGBA returns 16-bit channels; keep the top 8 bits.
				rVal = float32(r>>8) / 255.0
				gVal = float32(g>>8) / 255.0
				bVal = float32(b>>8) / 255.0
			}

			rVal = (rVal - mean[0]) / std[0]
			gVal = (gVal - mean[1]) / std[1]
			bVal = (bVal - mean[2]) / std[2]

			if channelFirst {
				out[i], out[n+i], out[2*n+i] = rVal, gVal, bVal
			} else {
				out[3*i], out[3*i+1], out[3*i+2] = rVal, gVal, bVal
			}
			i++
		}
	}

	return out
}
|
177
models/imageproc/images_test.go
Normal file
177
models/imageproc/images_test.go
Normal file
@ -0,0 +1,177 @@
|
||||
package imageproc
|
||||
|
||||
import (
|
||||
"image"
|
||||
"image/color"
|
||||
"image/draw"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// createImage builds a width x height RGBA test image, anchored at the
// origin, with every pixel set to fillCol.
func createImage(width, height int, fillCol color.RGBA) image.Image {
	canvas := image.NewRGBA(image.Rect(0, 0, width, height))
	fill := image.NewUniform(fillCol)
	draw.Draw(canvas, canvas.Bounds(), fill, image.Point{}, draw.Src)
	return canvas
}
|
||||
|
||||
func TestComposite(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
img image.Image
|
||||
expectedRGBA color.RGBA
|
||||
}{
|
||||
{
|
||||
name: "Transparent image",
|
||||
img: createImage(5, 5, color.RGBA{0, 0, 0, 0}),
|
||||
expectedRGBA: color.RGBA{255, 255, 255, 255},
|
||||
},
|
||||
{
|
||||
name: "Solid red image",
|
||||
img: createImage(5, 5, color.RGBA{255, 0, 0, 255}),
|
||||
expectedRGBA: color.RGBA{255, 0, 0, 255},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
resultImg := Composite(tt.img)
|
||||
|
||||
// Check the pixel values in the resulting image
|
||||
for x := range resultImg.Bounds().Dx() {
|
||||
for y := range resultImg.Bounds().Dy() {
|
||||
r, g, b, a := resultImg.At(x, y).RGBA()
|
||||
expectedR, expectedG, expectedB, expectedA := tt.expectedRGBA.RGBA()
|
||||
|
||||
if r != expectedR || g != expectedG || b != expectedB || a != expectedA {
|
||||
t.Errorf("Pixel mismatch at (%d, %d): got (%d, %d, %d, %d), want (%d, %d, %d, %d)",
|
||||
x, y, r, g, b, a, expectedR, expectedG, expectedB, expectedA)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestResize(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
img image.Image
|
||||
newSize image.Point
|
||||
method int
|
||||
expected image.Point
|
||||
}{
|
||||
{
|
||||
name: "Resize with bilinear interpolation",
|
||||
img: createImage(5, 5, color.RGBA{255, 0, 0, 255}),
|
||||
newSize: image.Point{10, 10},
|
||||
method: ResizeBilinear,
|
||||
expected: image.Point{10, 10},
|
||||
},
|
||||
{
|
||||
name: "Resize with nearest neighbor",
|
||||
img: createImage(10, 10, color.RGBA{0, 255, 0, 255}),
|
||||
newSize: image.Point{5, 5},
|
||||
method: ResizeNearestNeighbor,
|
||||
expected: image.Point{5, 5},
|
||||
},
|
||||
{
|
||||
name: "Resize with catmullrom",
|
||||
img: createImage(1024, 1024, color.RGBA{0, 0, 255, 255}),
|
||||
newSize: image.Point{10, 10},
|
||||
method: ResizeCatmullrom,
|
||||
expected: image.Point{10, 10},
|
||||
},
|
||||
{
|
||||
name: "Resize with approx bilinear",
|
||||
img: createImage(1024, 768, color.RGBA{100, 100, 100, 255}),
|
||||
newSize: image.Point{4, 3},
|
||||
method: ResizeApproxBilinear,
|
||||
expected: image.Point{4, 3},
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
resizedImg := Resize(tt.img, tt.newSize, tt.method)
|
||||
|
||||
if resizedImg.Bounds().Dx() != tt.expected.X || resizedImg.Bounds().Dy() != tt.expected.Y {
|
||||
t.Errorf("Unexpected size for resized image: got (%d, %d), want (%d, %d)",
|
||||
resizedImg.Bounds().Dx(), resizedImg.Bounds().Dy(), tt.expected.X, tt.expected.Y)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestResizeInvalidMethod(t *testing.T) {
|
||||
defer func() {
|
||||
if r := recover(); r == nil {
|
||||
t.Errorf("Expected panic for invalid resizing method, but did not panic")
|
||||
}
|
||||
}()
|
||||
|
||||
img := createImage(10, 10, color.RGBA{0, 0, 0, 255})
|
||||
Resize(img, image.Point{5, 5}, -1)
|
||||
}
|
||||
|
||||
func TestNormalize(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
img image.Image
|
||||
mean [3]float32
|
||||
std [3]float32
|
||||
rescale bool
|
||||
channelFirst bool
|
||||
expected []float32
|
||||
}{
|
||||
{
|
||||
name: "Rescale with channel first",
|
||||
img: createImage(2, 2, color.RGBA{128, 128, 128, 255}),
|
||||
mean: ImageNetStandardMean,
|
||||
std: ImageNetStandardSTD,
|
||||
rescale: true,
|
||||
channelFirst: true,
|
||||
expected: []float32{
|
||||
0.003921628, 0.003921628, 0.003921628, 0.003921628, // R values
|
||||
0.003921628, 0.003921628, 0.003921628, 0.003921628, // G values
|
||||
0.003921628, 0.003921628, 0.003921628, 0.003921628, // B values
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "Rescale without channel first",
|
||||
img: createImage(2, 2, color.RGBA{255, 0, 0, 255}),
|
||||
mean: [3]float32{0.0, 0.0, 0.0},
|
||||
std: [3]float32{1.0, 1.0, 1.0},
|
||||
rescale: true,
|
||||
channelFirst: false,
|
||||
expected: []float32{
|
||||
1.0, 0.0, 0.0,
|
||||
1.0, 0.0, 0.0,
|
||||
1.0, 0.0, 0.0,
|
||||
1.0, 0.0, 0.0,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "No rescale with mean/std adjustment",
|
||||
img: createImage(2, 2, color.RGBA{100, 150, 200, 255}),
|
||||
mean: ClipDefaultMean,
|
||||
std: ClipDefaultSTD,
|
||||
rescale: false,
|
||||
channelFirst: false,
|
||||
expected: []float32{
|
||||
-1.7922626, -1.7520971, -1.4802198,
|
||||
-1.7922626, -1.7520971, -1.4802198,
|
||||
-1.7922626, -1.7520971, -1.4802198,
|
||||
-1.7922626, -1.7520971, -1.4802198,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := Normalize(tt.img, tt.mean, tt.std, tt.rescale, tt.channelFirst)
|
||||
|
||||
if !reflect.DeepEqual(result, tt.expected) {
|
||||
t.Errorf("Test %s failed: got %v, want %v", tt.name, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
@ -1,19 +1,20 @@
|
||||
package imageproc
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"image"
|
||||
"image/color"
|
||||
_ "image/jpeg"
|
||||
_ "image/png"
|
||||
"io"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"golang.org/x/image/draw"
|
||||
|
||||
"github.com/ollama/ollama/models/imageproc"
|
||||
)
|
||||
|
||||
func GetSupportedAspectRatios(maxTiles int) []image.Point {
|
||||
func getSupportedAspectRatios(maxTiles int) []image.Point {
|
||||
ratios := []image.Point{}
|
||||
|
||||
for w := range maxTiles {
|
||||
@ -37,28 +38,8 @@ func clip(a, a_min, a_max int) int {
|
||||
return a
|
||||
}
|
||||
|
||||
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
|
||||
targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
|
||||
targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
|
||||
|
||||
scaleWidth := float64(targetWidth) / float64(imageSize.X)
|
||||
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
|
||||
|
||||
var w, h int
|
||||
|
||||
if scaleWidth < scaleHeight {
|
||||
w = targetWidth
|
||||
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
|
||||
} else {
|
||||
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
|
||||
h = targetHeight
|
||||
}
|
||||
|
||||
return image.Point{w, h}
|
||||
}
|
||||
|
||||
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
|
||||
possibleTileArrangements := GetSupportedAspectRatios(maxImageTiles)
|
||||
possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
|
||||
possibleCanvasSizes := []image.Point{}
|
||||
for _, pta := range possibleTileArrangements {
|
||||
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
|
||||
@ -113,6 +94,53 @@ func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i
|
||||
return selectedCanvas
|
||||
}
|
||||
|
||||
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
|
||||
targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
|
||||
targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
|
||||
|
||||
scaleWidth := float64(targetWidth) / float64(imageSize.X)
|
||||
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
|
||||
|
||||
var w, h int
|
||||
|
||||
if scaleWidth < scaleHeight {
|
||||
w = targetWidth
|
||||
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
|
||||
} else {
|
||||
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
|
||||
h = targetHeight
|
||||
}
|
||||
|
||||
return image.Point{w, h}
|
||||
}
|
||||
|
||||
func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
|
||||
if format == "png" {
|
||||
img = imageproc.Composite(img)
|
||||
}
|
||||
|
||||
b := img.Bounds()
|
||||
tileSize := outputSize.Y
|
||||
|
||||
canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
|
||||
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
|
||||
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
|
||||
|
||||
return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
|
||||
}
|
||||
|
||||
// padImage draws img onto a new transparent RGBA canvas that is
// aspectRatio.X tiles wide and aspectRatio.Y tiles high, where one tile is
// outputSize. The canvas starts at the origin. img is drawn into the
// rectangle given by its own bounds, reading source pixels from (0, 0), and
// the rest of the canvas is left transparent.
func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	canvasWidth := outputSize.X * aspectRatio.X
	canvasHeight := outputSize.Y * aspectRatio.Y

	canvas := image.NewRGBA(image.Rect(0, 0, canvasWidth, canvasHeight))
	draw.Draw(canvas, img.Bounds(), img, image.Point{}, draw.Over)

	return canvas
}
|
||||
|
||||
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
|
||||
b := img.Bounds()
|
||||
width := b.Max.X - b.Min.X
|
||||
@ -134,107 +162,40 @@ func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
|
||||
return images
|
||||
}
|
||||
|
||||
// remove the "alpha" channel by drawing over a prefilled image
|
||||
func compositeImage(img image.Image) image.Image {
|
||||
dst := image.NewRGBA(img.Bounds())
|
||||
|
||||
white := color.RGBA{255, 255, 255, 255}
|
||||
draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
|
||||
draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func ResizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
|
||||
if format == "png" {
|
||||
img = compositeImage(img)
|
||||
}
|
||||
|
||||
b := img.Bounds()
|
||||
tileSize := outputSize.Y
|
||||
|
||||
canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
|
||||
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
|
||||
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
|
||||
|
||||
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
|
||||
|
||||
// scaling choices:
|
||||
// NearestNeighbor fast, blocky output
|
||||
// ApproxBiLinear fast, medium quality
|
||||
// BiLinear slow, high quality
|
||||
// CatmullRom very slow, very high quality
|
||||
draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
|
||||
|
||||
return dst, aspectRatio
|
||||
}
|
||||
|
||||
func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
|
||||
paddedSize := image.Point{
|
||||
X: outputSize.X * aspectRatio.X,
|
||||
Y: outputSize.Y * aspectRatio.Y,
|
||||
}
|
||||
|
||||
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
|
||||
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
|
||||
func packImages(img image.Image, aspectRatio image.Point) []float32 {
|
||||
subImages := splitToTiles(img, aspectRatio)
|
||||
|
||||
var pixelVals []float32
|
||||
|
||||
rescale := true
|
||||
channelFirst := true
|
||||
|
||||
for _, subImg := range subImages {
|
||||
bounds := subImg.Bounds()
|
||||
var rVals, gVals, bVals []float32
|
||||
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
|
||||
for x := bounds.Min.X; x < bounds.Max.X; x++ {
|
||||
c := subImg.At(x, y)
|
||||
r, g, b, _ := c.RGBA()
|
||||
rVal := float32(r>>8) / 255.0
|
||||
gVal := float32(g>>8) / 255.0
|
||||
bVal := float32(b>>8) / 255.0
|
||||
|
||||
rVal = (rVal - mean[0]) / std[0]
|
||||
gVal = (gVal - mean[1]) / std[1]
|
||||
bVal = (bVal - mean[2]) / std[2]
|
||||
|
||||
rVals = append(rVals, rVal)
|
||||
gVals = append(gVals, gVal)
|
||||
bVals = append(bVals, bVal)
|
||||
}
|
||||
}
|
||||
pixelVals = append(pixelVals, rVals...)
|
||||
pixelVals = append(pixelVals, gVals...)
|
||||
pixelVals = append(pixelVals, bVals...)
|
||||
vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
|
||||
pixelVals = append(pixelVals, vals...)
|
||||
}
|
||||
|
||||
return pixelVals
|
||||
}
|
||||
|
||||
func Preprocess(imageData []byte) ([]float32, int, error) {
|
||||
// todo: need guard in here for bad image data
|
||||
|
||||
// mllama values
|
||||
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
|
||||
outputSize := image.Point{560, 560}
|
||||
maxTiles := 4
|
||||
|
||||
// clip values
|
||||
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
|
||||
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
|
||||
|
||||
img, format, err := image.Decode(bytes.NewReader(imageData))
|
||||
img, format, err := image.Decode(imageData)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to decode image: %w", err)
|
||||
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
|
||||
}
|
||||
|
||||
newImage, aspectRatio := ResizeImage(img, format, outputSize, maxTiles)
|
||||
newImage = PadImage(newImage, outputSize, aspectRatio)
|
||||
newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
|
||||
newImage = padImage(newImage, outputSize, aspectRatio)
|
||||
|
||||
data := PackImages(newImage, aspectRatio, mean, std)
|
||||
aspectRatioIndex := slices.Index(GetSupportedAspectRatios(maxTiles), aspectRatio) + 1
|
||||
data := packImages(newImage, aspectRatio)
|
||||
aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1
|
||||
|
||||
return data, aspectRatioIndex, nil
|
||||
opts := map[string]any{
|
||||
"aspectRatioIndex": aspectRatioIndex,
|
||||
}
|
||||
|
||||
return data, opts, nil
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package imageproc
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
@ -35,7 +35,7 @@ func TestAspectRatios(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := GetSupportedAspectRatios(c.MaxTiles)
|
||||
actual := getSupportedAspectRatios(c.MaxTiles)
|
||||
|
||||
if diff := cmp.Diff(actual, c.Expected); diff != "" {
|
||||
t.Errorf("mismatch (-got +want):\n%s", diff)
|
||||
@ -299,7 +299,7 @@ func TestResize(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actualImage, actualAspectRatio := ResizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
|
||||
actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
|
||||
|
||||
if actualImage.Bounds() != c.ExpectedImage.Bounds() {
|
||||
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
|
||||
@ -329,7 +329,7 @@ func TestPad(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := PadImage(c.TestImage, c.OutputSize, c.AspectRatio)
|
||||
actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)
|
||||
|
||||
if actual.Bounds() != c.Expected.Bounds() {
|
||||
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
|
||||
@ -344,9 +344,6 @@ func TestPackImages(t *testing.T) {
|
||||
ExpectedVals int
|
||||
}
|
||||
|
||||
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
|
||||
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
|
||||
|
||||
cases := []packCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
|
||||
@ -366,7 +363,7 @@ func TestPackImages(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actualVals := PackImages(c.TestImage, c.AspectRatio, mean, std)
|
||||
actualVals := packImages(c.TestImage, c.AspectRatio)
|
||||
if len(actualVals) != c.ExpectedVals {
|
||||
t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
|
||||
}
|
||||
@ -400,7 +397,7 @@ func TestPreprocess(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
imgData, aspectRatioID, err := Preprocess(buf.Bytes())
|
||||
imgData, opts, err := Preprocess(&buf)
|
||||
if err != nil {
|
||||
t.Fatalf("error processing: %q", err)
|
||||
}
|
||||
@ -409,6 +406,13 @@ func TestPreprocess(t *testing.T) {
|
||||
t.Errorf("no image data returned")
|
||||
}
|
||||
|
||||
ar, ok := opts["aspectRatioIndex"]
|
||||
if !ok {
|
||||
t.Fatalf("no aspect ratio found")
|
||||
}
|
||||
|
||||
aspectRatioID := ar.(int)
|
||||
|
||||
if aspectRatioID != c.ExpectedAspectRatioID {
|
||||
t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
|
||||
}
|
@ -11,7 +11,7 @@ import (
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/server/imageproc"
|
||||
"github.com/ollama/ollama/models/mllama"
|
||||
"github.com/ollama/ollama/template"
|
||||
)
|
||||
|
||||
@ -92,7 +92,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
|
||||
var imgData llm.ImageData
|
||||
|
||||
if isMllama {
|
||||
data, aspectRatioID, err := imageproc.Preprocess(i)
|
||||
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
@ -103,10 +103,15 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
ar, ok := opts["aspectRatioIndex"].(int)
|
||||
if !ok {
|
||||
return "", nil, fmt.Errorf("missing aspect ratio for image")
|
||||
}
|
||||
|
||||
imgData = llm.ImageData{
|
||||
ID: len(images),
|
||||
Data: buf.Bytes(),
|
||||
AspectRatioID: aspectRatioID,
|
||||
AspectRatioID: ar,
|
||||
}
|
||||
imgPrompt = "<|image|>"
|
||||
} else {
|
||||
|
@ -31,10 +31,10 @@ import (
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/models/mllama"
|
||||
"github.com/ollama/ollama/openai"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/runners"
|
||||
"github.com/ollama/ollama/server/imageproc"
|
||||
"github.com/ollama/ollama/template"
|
||||
"github.com/ollama/ollama/types/errtypes"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
@ -192,12 +192,18 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
images := make([]llm.ImageData, len(req.Images))
|
||||
for i := range req.Images {
|
||||
if isMllama {
|
||||
data, aspectRatioID, err := imageproc.Preprocess(req.Images[i])
|
||||
data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
|
||||
if err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
|
||||
return
|
||||
}
|
||||
|
||||
ar, ok := opts["aspectRatioIndex"].(int)
|
||||
if !ok {
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
|
||||
return
|
||||
}
|
||||
|
||||
buf := new(bytes.Buffer)
|
||||
err = binary.Write(buf, binary.LittleEndian, data)
|
||||
if err != nil {
|
||||
@ -205,7 +211,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID}
|
||||
images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
|
||||
} else {
|
||||
images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user