From 5ffbbea1d77278dcbb94f42da3fc48a0beb9b355 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Thu, 11 Jan 2024 15:51:47 -0800
Subject: [PATCH 01/12] remove client.py

---
 api/client.py | 284 --------------------------------------------------
 1 file changed, 284 deletions(-)
 delete mode 100644 api/client.py

diff --git a/api/client.py b/api/client.py
deleted file mode 100644
index a2eb50e8..00000000
--- a/api/client.py
+++ /dev/null
@@ -1,284 +0,0 @@
-import os
-import json
-import requests
-import os
-import hashlib
-import json
-from pathlib import Path
-
-BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
-
-# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
-# The final response object will include statistics and additional data from the request. Use the callback function to override
-# the default handler.
-def generate(model_name, prompt, system=None, template=None, format="", context=None, options=None, callback=None):
-    try:
-        url = f"{BASE_URL}/api/generate"
-        payload = {
-            "model": model_name,
-            "prompt": prompt,
-            "system": system,
-            "template": template,
-            "context": context,
-            "options": options,
-            "format": format,
-        }
-
-        # Remove keys with None values
-        payload = {k: v for k, v in payload.items() if v is not None}
-
-        with requests.post(url, json=payload, stream=True) as response:
-            response.raise_for_status()
-
-            # Creating a variable to hold the context history of the final chunk
-            final_context = None
-
-            # Variable to hold concatenated response strings if no callback is provided
-            full_response = ""
-
-            # Iterating over the response line by line and displaying the details
-            for line in response.iter_lines():
-                if line:
-                    # Parsing each line (JSON chunk) and extracting the details
-                    chunk = json.loads(line)
-
-                    # If a callback function is provided, call it with the chunk
-                    if callback:
-                        callback(chunk)
-                    else:
-                        # If this is not the last chunk, add the "response" field value to full_response and print it
-                        if not chunk.get("done"):
-                            response_piece = chunk.get("response", "")
-                            full_response += response_piece
-                            print(response_piece, end="", flush=True)
-
-                    # Check if it's the last chunk (done is true)
-                    if chunk.get("done"):
-                        final_context = chunk.get("context")
-
-        # Return the full response and the final context
-        return full_response, final_context
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None, None
-
-
-# Create a blob file on the server if it doesn't exist.
-def create_blob(digest, file_path):
-    url = f"{BASE_URL}/api/blobs/{digest}"
-
-    # Check if the blob exists
-    response = requests.head(url)
-    if response.status_code != 404:
-        return  # Blob already exists, no need to upload
-    response.raise_for_status()
-
-    # Upload the blob
-    with open(file_path, 'rb') as file_data:
-        requests.post(url, data=file_data)
-
-
-# Create a model from a Modelfile. Use the callback function to override the default handler.
-def create(model_name, filename, callback=None):
-    try:
-        file_path = Path(filename).expanduser().resolve()
-        processed_lines = []
-
-        # Read and process the modelfile
-        with open(file_path, 'r') as f:
-            for line in f:
-                # Skip empty or whitespace-only lines
-                if not line.strip():
-                    continue
-
-                command, args = line.split(maxsplit=1)
-
-                if command.upper() in ["FROM", "ADAPTER"]:
-                    path = Path(args.strip()).expanduser()
-
-                    # Check if path is relative and resolve it
-                    if not path.is_absolute():
-                        path = (file_path.parent / path)
-
-                    # Skip if file does not exist for "model", this is handled by the server
-                    if not path.exists():
-                        processed_lines.append(line)
-                        continue
-
-                    # Calculate SHA-256 hash
-                    with open(path, 'rb') as bin_file:
-                        hash = hashlib.sha256()
-                        hash.update(bin_file.read())
-                        blob = f"sha256:{hash.hexdigest()}"
-
-                    # Add the file to the remote server
-                    create_blob(blob, path)
-
-                    # Replace path with digest in the line
-                    line = f"{command} @{blob}\n"
-
-                processed_lines.append(line)
-
-        # Combine processed lines back into a single string
-        modelfile_content = '\n'.join(processed_lines)
-
-        url = f"{BASE_URL}/api/create"
-        payload = {"name": model_name, "modelfile": modelfile_content}
-
-        # Making a POST request with the stream parameter set to True to handle streaming responses
-        with requests.post(url, json=payload, stream=True) as response:
-            response.raise_for_status()
-            # Iterating over the response line by line and displaying the status
-            for line in response.iter_lines():
-                if line:
-                    chunk = json.loads(line)
-                    if callback:
-                        callback(chunk)
-                    else:
-                        print(f"Status: {chunk.get('status')}")
-
-    except Exception as e:
-        print(f"An error occurred: {e}")
-
-
-# Pull a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple
-# calls to will share the same download progress. Use the callback function to override the default handler.
-def pull(model_name, insecure=False, callback=None):
-    try:
-        url = f"{BASE_URL}/api/pull"
-        payload = {
-            "name": model_name,
-            "insecure": insecure
-        }
-
-        # Making a POST request with the stream parameter set to True to handle streaming responses
-        with requests.post(url, json=payload, stream=True) as response:
-            response.raise_for_status()
-
-            # Iterating over the response line by line and displaying the details
-            for line in response.iter_lines():
-                if line:
-                    # Parsing each line (JSON chunk) and extracting the details
-                    chunk = json.loads(line)
-
-                    # If a callback function is provided, call it with the chunk
-                    if callback:
-                        callback(chunk)
-                    else:
-                        # Print the status message directly to the console
-                        print(chunk.get('status', ''), end='', flush=True)
-
-                        # If there's layer data, you might also want to print that (adjust as necessary)
-                        if 'digest' in chunk:
-                            print(f" - Digest: {chunk['digest']}", end='', flush=True)
-                            print(f" - Total: {chunk['total']}", end='', flush=True)
-                            print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
-                        else:
-                            print()
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-
-# Push a model to the model registry. Use the callback function to override the default handler.
-def push(model_name, insecure=False, callback=None):
-    try:
-        url = f"{BASE_URL}/api/push"
-        payload = {
-            "name": model_name,
-            "insecure": insecure
-        }
-
-        # Making a POST request with the stream parameter set to True to handle streaming responses
-        with requests.post(url, json=payload, stream=True) as response:
-            response.raise_for_status()
-
-            # Iterating over the response line by line and displaying the details
-            for line in response.iter_lines():
-                if line:
-                    # Parsing each line (JSON chunk) and extracting the details
-                    chunk = json.loads(line)
-
-                    # If a callback function is provided, call it with the chunk
-                    if callback:
-                        callback(chunk)
-                    else:
-                        # Print the status message directly to the console
-                        print(chunk.get('status', ''), end='', flush=True)
-
-                        # If there's layer data, you might also want to print that (adjust as necessary)
-                        if 'digest' in chunk:
-                            print(f" - Digest: {chunk['digest']}", end='', flush=True)
-                            print(f" - Total: {chunk['total']}", end='', flush=True)
-                            print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
-                        else:
-                            print()
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-
-# List models that are available locally.
-def list():
-    try:
-        response = requests.get(f"{BASE_URL}/api/tags")
-        response.raise_for_status()
-        data = response.json()
-        models = data.get('models', [])
-        return models
-
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None
-
-# Copy a model. Creates a model with another name from an existing model.
-def copy(source, destination):
-    try:
-        # Create the JSON payload
-        payload = {
-            "source": source,
-            "destination": destination
-        }
-
-        response = requests.post(f"{BASE_URL}/api/copy", json=payload)
-        response.raise_for_status()
-
-        # If the request was successful, return a message indicating that the copy was successful
-        return "Copy successful"
-
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None
-
-# Delete a model and its data.
-def delete(model_name):
-    try:
-        url = f"{BASE_URL}/api/delete"
-        payload = {"name": model_name}
-        response = requests.delete(url, json=payload)
-        response.raise_for_status()
-        return "Delete successful"
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None
-
-# Show info about a model.
-def show(model_name):
-    try:
-        url = f"{BASE_URL}/api/show"
-        payload = {"name": model_name}
-        response = requests.post(url, json=payload)
-        response.raise_for_status()
-
-        # Parse the JSON response and return it
-        data = response.json()
-        return data
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None
-
-def heartbeat():
-    try:
-        url = f"{BASE_URL}/"
-        response = requests.head(url)
-        response.raise_for_status()
-        return "Ollama is running"
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return "Ollama is not running"
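
For context on what the removed helper did: the sketch below reproduces the same line-delimited streaming flow against /api/generate in Go, using only the standard library. The endpoint and response shape are taken from the deleted code above; the model name and prompt are illustrative placeholders, not values from the patch.

```go
// Minimal sketch of the removed generate() helper: POST a prompt to
// /api/generate and print each streamed "response" fragment as it arrives.
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	payload, _ := json.Marshal(map[string]any{
		"model":  "llama2", // placeholder model name
		"prompt": "Why is the sky blue?",
	})

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Each line of the streaming body is a standalone JSON object.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var chunk struct {
			Response string `json:"response"`
			Done     bool   `json:"done"`
		}
		if err := json.Unmarshal(scanner.Bytes(), &chunk); err != nil {
			panic(err)
		}
		fmt.Print(chunk.Response)
		if chunk.Done {
			fmt.Println()
		}
	}
}
```

The next two patches point Python users at the dedicated ollama-python client instead.
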
From cbe20c43754b42335ba517e513727bb0f0db74d5 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Thu, 11 Jan 2024 16:24:37 -0800
Subject: [PATCH 02/12] update readme

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 1f7318a3..9fd6e6ed 100644
--- a/README.md
+++ b/README.md
@@ -248,6 +248,10 @@ curl http://localhost:11434/api/chat -d '{
 
 See the [API documentation](./docs/api.md) for all endpoints.
 
+## Official Integrations
+
+- [ollama-python](https://github.com/jmorganca/ollama-python)
+
 ## Community Integrations
 
 ### Web & Desktop

From a70262c6b20b3b18559784bb41303737702aad33 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Fri, 12 Jan 2024 09:43:04 -0800
Subject: [PATCH 03/12] Update README.md

Co-authored-by: Jeffrey Morgan
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9fd6e6ed..9bf9b8ae 100644
--- a/README.md
+++ b/README.md
@@ -248,7 +248,7 @@ curl http://localhost:11434/api/chat -d '{
 
 See the [API documentation](./docs/api.md) for all endpoints.
 
-## Official Integrations
+## Integrations
 
 - [ollama-python](https://github.com/jmorganca/ollama-python)
 
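
The README hunk above sits next to a curl example for /api/chat, which streams by default. For completeness, a small Go sketch of the non-streaming variant, assuming the documented `"stream": false` request field (the model name is again a placeholder):

```go
// Non-streaming /api/chat call: with "stream": false the server replies
// with a single JSON object instead of line-delimited chunks.
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	payload, _ := json.Marshal(map[string]any{
		"model": "llama2", // placeholder model name
		"messages": []map[string]string{
			{"role": "user", "content": "Hello!"},
		},
		"stream": false,
	})

	resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out struct {
		Message struct {
			Role    string `json:"role"`
			Content string `json:"content"`
		} `json:"message"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Message.Content)
}
```
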
From 905862e17b246f728d300232c1d3aec28ed65fa1 Mon Sep 17 00:00:00 2001
From: Fabian Preiss
Date: Tue, 9 Jan 2024 21:55:36 +0100
Subject: [PATCH 04/12] improve cuda detection (rel. issue #1704)

---
 llm/generate/gen_linux.sh | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 0c940ba5..7c571384 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -39,8 +39,13 @@ amdGPUs() {
 }
 
 echo "Starting linux generate script"
-if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
-    export CUDACXX=/usr/local/cuda/bin/nvcc
+if [ -z "${CUDACXX}" ]; then
+    if [ -x /usr/local/cuda/bin/nvcc ]; then
+        export CUDACXX=/usr/local/cuda/bin/nvcc
+    else
+        # Try the default location in case it exists
+        export CUDACXX=$(command -v nvcc)
+    fi
 fi
 COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
 source $(dirname $0)/gen_common.sh
@@ -109,16 +114,20 @@ else
     echo "Skipping CPU generation step as requested"
 fi
 
-if [ -d /usr/local/cuda/lib64/ ]; then
+if [ -z "${CUDA_LIB_DIR}" ]; then
+    # Try the default location in case it exists
+    CUDA_LIB_DIR=/usr/local/cuda/lib64
+fi
+
+if [ -d "${CUDA_LIB_DIR}" ]; then
     echo "CUDA libraries detected - building dynamic CUDA library"
     init_vars
-    CUDA_MAJOR=$(ls /usr/local/cuda/lib64/libcudart.so.* | head -1 | cut -f3 -d. || true)
+    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
     if [ -n "${CUDA_MAJOR}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
     fi
     CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
     BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda${CUDA_VARIANT}"
-    CUDA_LIB_DIR=/usr/local/cuda/lib64
     build
     install
     gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \

From cf29bd2d72f70170ef7b5adf6d3e30ac6a23331c Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Fri, 12 Jan 2024 13:32:24 -0800
Subject: [PATCH 05/12] fix: request retry with error

this fixes a subtle bug with makeRequestWithRetry where an HTTP status
error on a retried request will potentially not return the right err
---
 server/images.go | 65 +++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 34 deletions(-)

diff --git a/server/images.go b/server/images.go
index 4742a363..2e00cace 100644
--- a/server/images.go
+++ b/server/images.go
@@ -1132,49 +1132,46 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
 var errUnauthorized = fmt.Errorf("unauthorized")
 
 func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *RegistryOptions) (*http.Response, error) {
-	resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
-	if err != nil {
-		if !errors.Is(err, context.Canceled) {
-			log.Printf("request failed: %v", err)
-		}
-
-		return nil, err
-	}
-
-	switch {
-	case resp.StatusCode == http.StatusUnauthorized:
-		// Handle authentication error with one retry
-		auth := resp.Header.Get("www-authenticate")
-		authRedir := ParseAuthRedirectString(auth)
-		token, err := getAuthToken(ctx, authRedir)
+	for i := 0; i < 2; i++ {
+		resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
 		if err != nil {
+			if !errors.Is(err, context.Canceled) {
+				log.Printf("request failed: %v", err)
+			}
+
 			return nil, err
 		}
-		regOpts.Token = token
-		if body != nil {
-			_, err = body.Seek(0, io.SeekStart)
+
+		switch {
+		case resp.StatusCode == http.StatusUnauthorized:
+			// Handle authentication error with one retry
+			auth := resp.Header.Get("www-authenticate")
+			authRedir := ParseAuthRedirectString(auth)
+			token, err := getAuthToken(ctx, authRedir)
 			if err != nil {
 				return nil, err
 			}
+			regOpts.Token = token
+			if body != nil {
+				_, err = body.Seek(0, io.SeekStart)
+				if err != nil {
+					return nil, err
+				}
+			}
+		case resp.StatusCode == http.StatusNotFound:
+			return nil, os.ErrNotExist
+		case resp.StatusCode >= http.StatusBadRequest:
+			responseBody, err := io.ReadAll(resp.Body)
+			if err != nil {
+				return nil, fmt.Errorf("%d: %s", resp.StatusCode, err)
+			}
+			return nil, fmt.Errorf("%d: %s", resp.StatusCode, responseBody)
+		default:
+			return resp, nil
 		}
-
-		resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
-		if resp.StatusCode == http.StatusUnauthorized {
-			return nil, errUnauthorized
-		}
-
-		return resp, err
-	case resp.StatusCode == http.StatusNotFound:
-		return nil, os.ErrNotExist
-	case resp.StatusCode >= http.StatusBadRequest:
-		responseBody, err := io.ReadAll(resp.Body)
-		if err != nil {
-			return nil, fmt.Errorf("%d: %s", resp.StatusCode, err)
-		}
-		return nil, fmt.Errorf("%d: %s", resp.StatusCode, responseBody)
 	}
 
-	return resp, nil
+	return nil, errUnauthorized
 }
 
 func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
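
The retry shape patch 05 introduces can be summarized outside the registry code. Below is a simplified sketch of the same bounded loop; `doRequest` and `refreshToken` are hypothetical stand-ins for `makeRequest` and the www-authenticate token exchange, not part of the patch:

```go
// Bounded retry: at most two attempts, refreshing credentials after a 401.
// If the second attempt is still rejected, errUnauthorized is returned,
// which is the case the original single-retry code could mishandle.
package main

import (
	"errors"
	"fmt"
	"net/http"
	"net/http/httptest"
)

var errUnauthorized = errors.New("unauthorized")

func requestWithRetry(doRequest func() (*http.Response, error), refreshToken func() error) (*http.Response, error) {
	for i := 0; i < 2; i++ {
		resp, err := doRequest()
		if err != nil {
			return nil, err
		}

		switch {
		case resp.StatusCode == http.StatusUnauthorized:
			resp.Body.Close()
			// First 401: refresh and loop. A second 401 exits the loop
			// and falls through to the errUnauthorized return below.
			if err := refreshToken(); err != nil {
				return nil, err
			}
		case resp.StatusCode >= http.StatusBadRequest:
			resp.Body.Close()
			return nil, fmt.Errorf("request failed: %d", resp.StatusCode)
		default:
			return resp, nil
		}
	}

	return nil, errUnauthorized
}

func main() {
	// Stub server that accepts every request, just to exercise the loop.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {}))
	defer srv.Close()

	resp, err := requestWithRetry(
		func() (*http.Response, error) { return http.Get(srv.URL) },
		func() error { return nil },
	)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.StatusCode)
}
```

Folding every exit path into one loop means the status-code handling runs on the retried response too, which is exactly the subtle bug the commit message describes.
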
From 2ecb2472769ba0bb364aeeb109659750ad8153ce Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Sat, 13 Jan 2024 14:46:34 -0800
Subject: [PATCH 06/12] Fix intel mac build

Make sure we're building an x86 ext_server lib when cross-compiling
---
 llm/dyn_ext_server.go      | 2 +-
 llm/generate/gen_darwin.sh | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/llm/dyn_ext_server.go b/llm/dyn_ext_server.go
index 105df634..797222c5 100644
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -75,7 +75,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	updatePath(filepath.Dir(library))
 	libPath := C.CString(library)
 	defer C.free(unsafe.Pointer(libPath))
-	resp := newExtServerResp(128)
+	resp := newExtServerResp(512)
 	defer freeExtServerResp(resp)
 	var srv C.struct_dynamic_llama_server
 	C.dyn_init(libPath, &srv, &resp)
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index b7f1f684..6dc09987 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -14,9 +14,11 @@ BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal"
 case "${GOARCH}" in
 "amd64")
     CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    ARCH="x86_64"
     ;;
 "arm64")
     CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DLLAMA_METAL=on ${CMAKE_DEFS}"
+    ARHC="arm64"
     ;;
 *)
     echo "GOARCH must be set"
@@ -30,6 +32,7 @@ apply_patches
 build
 install
 gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
+    -arch ${ARCH} \
     -Wl,-force_load ${BUILD_DIR}/lib/libext_server.a \
     ${BUILD_DIR}/lib/libcommon.a \
     ${BUILD_DIR}/lib/libllama.a \
From 3ca5f69ce889c4ba16086fbcfb388c4c940aa421 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Sun, 14 Jan 2024 08:32:57 -0800
Subject: [PATCH 07/12] Fix typo in arm mac arch script

---
 llm/generate/gen_darwin.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 6dc09987..3a57d0cb 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -18,7 +18,7 @@ case "${GOARCH}" in
     ;;
 "arm64")
     CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DLLAMA_METAL=on ${CMAKE_DEFS}"
-    ARHC="arm64"
+    ARCH="arm64"
     ;;
 *)
     echo "GOARCH must be set"
From b3035112a113bcf609e2bf79f71f33f35863f3e3 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Sun, 14 Jan 2024 09:19:45 -0800
Subject: [PATCH 08/12] Add macos cross-compile CI coverage

---
 .github/workflows/test.yaml | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 4ffab937..441a66e2 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -8,7 +8,15 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [amd64, arm64]
+        exclude:
+          - os: ubuntu-latest
+            arch: arm64
+          - os: windows-latest
+            arch: arm64
     runs-on: ${{ matrix.os }}
+    env:
+      GOARCH: ${{ matrix.arch }}
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v4
@@ -33,7 +41,7 @@
       - run: go generate -x ./...
       - uses: actions/upload-artifact@v4
         with:
-          name: ${{ matrix.os }}-libraries
+          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
           path: |
             llm/llama.cpp/build/**/lib/*
   lint:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [amd64, arm64]
+        exclude:
+          - os: ubuntu-latest
+            arch: arm64
+          - os: windows-latest
+            arch: arm64
+          - os: macos-latest
+            arch: amd64
     runs-on: ${{ matrix.os }}
+    env:
+      GOARCH: ${{ matrix.arch }}
+      CGO_ENABLED: "1"
     steps:
       - uses: actions/checkout@v4
         with:
           submodules: recursive
       - uses: actions/setup-go@v4
         with:
           cache: false
       - uses: actions/download-artifact@v4
         with:
-          name: ${{ matrix.os }}-libraries
+          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
           path: llm/llama.cpp/build
       - uses: golangci/golangci-lint-action@v3
   test:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [amd64, arm64]
+        exclude:
+          - os: ubuntu-latest
+            arch: arm64
+          - os: windows-latest
+            arch: arm64
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v4
       - run: go get
       - uses: actions/download-artifact@v4
         with:
-          name: ${{ matrix.os }}-libraries
+          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
           path: llm/llama.cpp/build
       - run: go build
       - run: go test -v ./...

From eef50accb425a1815625f496bc5b24d5211d5610 Mon Sep 17 00:00:00 2001
From: Patrick Devine
Date: Tue, 16 Jan 2024 10:34:44 -0800
Subject: [PATCH 09/12] Fix show parameters (#2017)

---
 server/routes.go      | 22 +++-------------------
 server/routes_test.go | 39 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/server/routes.go b/server/routes.go
index 72c0d051..d76d4b4e 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -15,7 +15,6 @@ import (
 	"path/filepath"
 	"reflect"
 	"runtime"
-	"strconv"
 	"strings"
 	"sync"
 	"syscall"
@@ -668,27 +667,12 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	cs := 30
 	for k, v := range model.Options {
 		switch val := v.(type) {
-		case string:
-			params = append(params, fmt.Sprintf("%-*s %s", cs, k, val))
-		case int:
-			params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(val)))
-		case float64:
-			params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(val, 'f', 0, 64)))
-		case bool:
-			params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(val)))
 		case []interface{}:
 			for _, nv := range val {
-				switch nval := nv.(type) {
-				case string:
-					params = append(params, fmt.Sprintf("%-*s %s", cs, k, nval))
-				case int:
-					params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(nval)))
-				case float64:
-					params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(nval, 'f', 0, 64)))
-				case bool:
-					params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(nval)))
-				}
+				params = append(params, fmt.Sprintf("%-*s %#v", cs, k, nv))
 			}
+		default:
+			params = append(params, fmt.Sprintf("%-*s %#v", cs, k, v))
 		}
 	}
 	resp.Parameters = strings.Join(params, "\n")
diff --git a/server/routes_test.go b/server/routes_test.go
index aa561d98..b2d93958 100644
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -9,6 +9,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"sort"
 	"strings"
 	"testing"
 
@@ -50,7 +51,7 @@ func Test_Routes(t *testing.T) {
 
 	createTestModel := func(t *testing.T, name string) {
 		fname := createTestFile(t, "ollama-model")
-		modelfile := strings.NewReader(fmt.Sprintf("FROM %s", fname))
+		modelfile := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
 		commands, err := parser.Parse(modelfile)
 		assert.Nil(t, err)
 		fn := func(resp api.ProgressResponse) {
@@ -167,6 +168,42 @@ func Test_Routes(t *testing.T) {
 			assert.Equal(t, "beefsteak:latest", model.ShortName)
 		},
 	},
+	{
+		Name:   "Show Model Handler",
+		Method: http.MethodPost,
+		Path:   "/api/show",
+		Setup: func(t *testing.T, req *http.Request) {
+			createTestModel(t, "show-model")
+			showReq := api.ShowRequest{Model: "show-model"}
+			jsonData, err := json.Marshal(showReq)
+			assert.Nil(t, err)
+			req.Body = io.NopCloser(bytes.NewReader(jsonData))
+		},
+		Expected: func(t *testing.T, resp *http.Response) {
+			contentType := resp.Header.Get("Content-Type")
+			assert.Equal(t, contentType, "application/json; charset=utf-8")
+			body, err := io.ReadAll(resp.Body)
+			assert.Nil(t, err)
+
+			var showResp api.ShowResponse
+			err = json.Unmarshal(body, &showResp)
+			assert.Nil(t, err)
+
+			var params []string
+			paramsSplit := strings.Split(showResp.Parameters, "\n")
+			for _, p := range paramsSplit {
+				params = append(params, strings.Join(strings.Fields(p), " "))
+			}
+			sort.Strings(params)
+			expectedParams := []string{
+				"seed 42",
+				"stop \"bar\"",
+				"stop \"foo\"",
+				"top_p 0.9",
+			}
+			assert.Equal(t, expectedParams, params)
+		},
+	},
 }
 
 s, err := setupServer(t)
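
To see why the test above expects `stop "foo"` with quotes but `seed 42` bare, here is a self-contained sketch of the `%#v` formatting the new GetModelInfo code relies on. The option values mirror the test's PARAMETER lines; this is an illustration, not code from the patch:

```go
// %#v prints strings Go-quoted and numbers/bools bare, which is what the
// simplified switch in GetModelInfo depends on.
package main

import "fmt"

func main() {
	options := map[string]interface{}{
		"seed":  42,
		"top_p": 0.9,
		"stop":  []interface{}{"foo", "bar"},
	}

	cs := 30
	for k, v := range options {
		switch val := v.(type) {
		case []interface{}:
			// Slice-valued options get one line per element.
			for _, nv := range val {
				fmt.Printf("%-*s %#v\n", cs, k, nv)
			}
		default:
			fmt.Printf("%-*s %#v\n", cs, k, v)
		}
	}
}
```

This prints lines such as `seed 42` and `stop "foo"`, collapsing the per-type strconv cases the patch deletes into a single verb.
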
From a897e833b8f772a59675f75cafb8169601e36b7a Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Tue, 16 Jan 2024 13:48:05 -0500
Subject: [PATCH 10/12] do not cache prompt (#2018)

- prompt cache causes inference to hang after some time
---
 llm/dyn_ext_server.go | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llm/dyn_ext_server.go b/llm/dyn_ext_server.go
index 797222c5..fa0d7750 100644
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -181,7 +181,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 		"seed":          predict.Options.Seed,
 		"stop":          predict.Options.Stop,
 		"image_data":    imageData,
-		"cache_prompt":  true,
 	}
 
 	if predict.Format == "json" {

From 795674dd9078e21ea7bbe3587d39f26e9351c8b0 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Wed, 10 Jan 2024 15:52:35 -0800
Subject: [PATCH 11/12] Bump llama.cpp to b1842 and add new cuda lib dep

Upstream llama.cpp has added a new dependency with the NVIDIA CUDA
Driver Libraries (libcuda.so) which is part of the driver distribution,
not the general cuda libraries, and is not available as an archive, so
we can not statically link it. This may introduce some additional
compatibility challenges which we'll need to keep an eye on.
---
 llm/generate/gen_linux.sh | 1 +
 llm/llama.cpp             | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 7c571384..10ca450b 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -141,6 +141,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
         ${CUDA_LIB_DIR}/libcublasLt_static.a \
         ${CUDA_LIB_DIR}/libcudadevrt.a \
         ${CUDA_LIB_DIR}/libculibos.a \
+        -lcuda \
         -lrt -lpthread -ldl -lstdc++ -lm
 fi
 
diff --git a/llm/llama.cpp b/llm/llama.cpp
index 328b83de..584d674b 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit 328b83de23b33240e28f4e74900d1d06726f5eb1
+Subproject commit 584d674be622fbf1578694ada6e62eebedbfd377
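
One way to double-check the new dynamic dependency patch 11 describes is to list the DT_NEEDED entries of the built shared library. A sketch using Go's debug/elf follows; the .so path is an assumption based on the build script's BUILD_DIR layout, so adjust it to your checkout:

```go
// List the dynamic library dependencies of the built ext_server library;
// after this change, libcuda.so.1 should appear among them on CUDA builds.
package main

import (
	"debug/elf"
	"fmt"
	"log"
)

func main() {
	// Assumed output path from gen_linux.sh; the CUDA variant suffix may differ.
	f, err := elf.Open("llm/llama.cpp/build/linux/cuda/lib/libext_server.so")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	libs, err := f.ImportedLibraries()
	if err != nil {
		log.Fatal(err)
	}
	for _, lib := range libs {
		fmt.Println(lib)
	}
}
```
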
From 96cfb626415cd811ab545c134bd0d16fa7aca044 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Tue, 16 Jan 2024 16:48:05 -0800
Subject: [PATCH 12/12] fix: normalize name path before splitting

---
 server/modelpath.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/server/modelpath.go b/server/modelpath.go
index f09ff8e9..af3f36ab 100644
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -46,6 +46,7 @@ func ParseModelPath(name string) ModelPath {
 		name = after
 	}
 
+	name = strings.ReplaceAll(name, string(os.PathSeparator), "/")
 	parts := strings.Split(name, "/")
 	switch len(parts) {
 	case 3: