diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 3b50e723..ab10ef29 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -249,6 +249,16 @@ jobs:
       run: make -j 4
     - run: go build .
+  validate-openapi:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - uses: stoplightio/spectral-action@latest
+        with:
+          file_glob: 'specs/*.yaml'
+
   lint:
     strategy:
       matrix:
diff --git a/.spectral.yaml b/.spectral.yaml
new file mode 100644
index 00000000..1cac3b3d
--- /dev/null
+++ b/.spectral.yaml
@@ -0,0 +1 @@
+extends: ["spectral:oas", "spectral:asyncapi"]
diff --git a/specs/openapi-3.1.yaml b/specs/openapi-3.1.yaml
new file mode 100644
index 00000000..b26fcfe6
--- /dev/null
+++ b/specs/openapi-3.1.yaml
@@ -0,0 +1,1026 @@
+openapi: 3.1.0
+info:
+  title: Ollama API
+  description: API for interacting with the Ollama service.
+  version: 0.1.44
+  contact:
+    name: Ollama
+    url: https://github.com/ollama/ollama
+servers:
+  - url: http://{host}:{port}
+    description: Ollama API server
+    variables:
+      host:
+        default: 127.0.0.1
+      port:
+        default: '11434'
+
+tags:
+  - name: generate
+    description: Generate responses
+  - name: chat
+    description: Generate chat responses
+  - name: models
+    description: Manage models
+  - name: blobs
+    description: Manage blobs
+  - name: embeddings
+    description: Generate embeddings
+  - name: server
+    description: Server information
+
+paths:
+  /api/generate:
+    post:
+      operationId: generateResponse
+      tags:
+        - generate
+      description: |
+        Generate a response for a given prompt with a provided model. This is
+        a streaming endpoint, so there will be a series of responses. The
+        final response object will include statistics and additional data
+        from the request.
+      summary: Generate a response for a given prompt with a provided model
+      requestBody:
+        required: true
+        description: Request to generate a response
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/GenerateRequest'
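+            # Non-normative request example (an editorial sketch); "llama3"
+            # stands in for any locally pulled model name.
+            example:
+              model: llama3
+              prompt: Why is the sky blue?
+              stream: false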
+      responses:
+        '200':
+          description: A response was successfully generated for the prompt
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateResponse'
+
+  /api/chat:
+    post:
+      operationId: generateChat
+      tags:
+        - chat
+        - generate
+      description: |
+        Generate the next message in a chat with a provided model. This is a
+        streaming endpoint, so there will be a series of responses. Streaming
+        can be disabled using "stream": false. The final response object will
+        include statistics and additional data from the request.
+      summary: Generate the next message in a chat with a provided model
+      requestBody:
+        required: true
+        description: Request to generate a response in a chat
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/ChatRequest'
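+            # Non-normative request example (an editorial sketch); the model
+            # name "llama3" is an assumption, any chat-capable model works.
+            example:
+              model: llama3
+              messages:
+                - role: user
+                  content: Why is the sky blue?
+              stream: false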
+      responses:
+        '200':
+          description: The next message was successfully generated for the chat
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ChatResponse'
+
+  /api/create:
+    post:
+      operationId: createModel
+      tags:
+        - models
+      description: |
+        Create a model from a Modelfile. It is recommended to set modelfile
+        to the content of the Modelfile rather than just setting path; this
+        is a requirement for remote create. Remote model creation must also
+        create any file blobs (fields such as FROM and ADAPTER) explicitly
+        with the server using Create a Blob, and use the path indicated in
+        the response.
+      summary: Create a model from a Modelfile
+      requestBody:
+        required: true
+        description: Request to create a model
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/CreateRequest'
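+            # Non-normative request example (an editorial sketch) mirroring
+            # the property examples in the CreateRequest schema below.
+            example:
+              model: mario
+              modelfile: "FROM llama3\nSYSTEM You are mario from Super Mario Bros."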
+      responses:
+        '200':
+          description: The model was successfully created
+          content:
+            application/x-ndjson:
+              schema:
+                $ref: '#/components/schemas/ProgressResponse'
+
+  /api/blobs/{digest}:
+    get:
+      operationId: getBlob
+      tags:
+        - blobs
+      description: |
+        Ensures that the file blob used for a FROM or ADAPTER field exists on
+        the server. This checks your Ollama server, not ollama.ai.
+      summary: Check that a file blob exists on the server
+      parameters:
+        - name: digest
+          in: path
+          required: true
+          description: The SHA256 digest of the blob
+          schema:
+            type: string
+      responses:
+        '200':
+          description: The blob exists on the server
+        '404':
+          description: The blob does not exist on the server
+    post:
+      operationId: createBlob
+      tags:
+        - blobs
+      description: Create a blob from a file on the server
+      summary: Create a blob from a file on the server
+      parameters:
+        - name: digest
+          in: path
+          required: true
+          description: The SHA256 digest of the blob
+          schema:
+            type: string
+      requestBody:
+        required: true
+        description: The file to create the blob from
+        content:
+          application/octet-stream:
+            schema:
+              type: string
+              format: binary
+      responses:
+        '201':
+          description: Blob was successfully created
+        '400':
+          description: The provided digest did not match the uploaded content
+
+  /api/tags:
+    get:
+      operationId: getModels
+      tags:
+        - models
+      description: List models that are available locally
+      summary: List models that are available locally
+      responses:
+        '200':
+          description: The models were successfully fetched
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListResponse'
+
+  /api/show:
+    post:
+      operationId: showModel
+      tags:
+        - models
+      description: |
+        Show information about a model including details, modelfile,
+        template, parameters, license, and system prompt.
+      summary: Show information about a model
+      requestBody:
+        required: true
+        description: Request to show a model
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/ShowRequest'
+      responses:
+        '200':
+          description: The model's information was successfully fetched
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ShowResponse'
+
+  /api/copy:
+    post:
+      operationId: copyModel
+      tags:
+        - models
+      description: |
+        Copy a model. Creates a model with another name from an existing
+        model.
+      summary: Copy a model
+      requestBody:
+        required: true
+        description: Request to copy a model
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/CopyRequest'
+      responses:
+        '200':
+          description: Model was successfully copied
+        '404':
+          description: Source model does not exist
+
+  /api/delete:
+    delete:
+      operationId: deleteModel
+      tags:
+        - models
+      description: Delete a model and its data
+      summary: Delete a model and its data
+      requestBody:
+        required: true
+        description: Request to delete a model
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/DeleteRequest'
+      responses:
+        '200':
+          description: Model was successfully deleted
+        '404':
+          description: Model does not exist
+
+  /api/pull:
+    post:
+      operationId: pullModel
+      tags:
+        - models
+      description: |
+        Download a model from the ollama library. Cancelled pulls are resumed
+        from where they left off, and multiple calls will share the same
+        download progress.
+      summary: Download a model from the ollama library
+      requestBody:
+        required: true
+        description: Request to pull a model
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/PullRequest'
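+            # Non-normative request example (an editorial sketch); with
+            # "stream": false a single response object is returned instead
+            # of a stream of progress objects.
+            example:
+              model: llama3
+              stream: false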
"json") + keep_alive: + $ref: '#/components/schemas/Duration' + options: + $ref: '#/components/schemas/Options' + + ChatResponse: + type: object + description: Response from a chat request + properties: + model: + type: string + description: The model name + created_at: + type: string + format: date-time + description: Timestamp of the response + message: + $ref: '#/components/schemas/Message' + done_reason: + type: string + description: Reason the model stopped generating text + done: + type: boolean + description: Specifies if the response is complete + total_duration: + type: number + description: Total duration of the request + load_duration: + type: number + description: Load duration of the request + prompt_eval_count: + type: integer + description: Count of prompt evaluations + prompt_eval_duration: + type: number + description: Duration of prompt evaluations + eval_count: + type: integer + description: Count of evaluations + eval_duration: + type: number + description: Duration of evaluations + + CreateRequest: + type: object + description: Request to create a model + properties: + model: + type: string + description: The name of the model to create + example: mario + path: + type: string + description: The path to the model file + modelfile: + type: string + description: The modelfile content + example: FROM llama3\nSYSTEM You are mario from Super Mario Bros. + stream: + type: boolean + description: | + If false the response will be returned as a single response object, + rather than a stream of objects + quantize: + type: string + description: Specifies the quantization level of the model + required: + - model + + ListResponse: + type: object + description: Response from a list request + properties: + models: + type: array + items: + $ref: '#/components/schemas/ListModelResponse' + + ListModelResponse: + type: object + description: Response from a list request + properties: + name: + type: string + model: + type: string + modified_at: + type: string + format: date-time + size: + type: integer + digest: + type: string + details: + $ref: '#/components/schemas/ModelDetails' + + ShowRequest: + type: object + description: Request to show a model + properties: + model: + type: string + description: The name of the model to show + required: + - model + + ShowResponse: + type: object + description: Response from a show request + properties: + license: + type: string + description: The model license + modelfile: + type: string + description: The modelfile content + parameters: + type: string + description: The model parameters + template: + type: string + description: The model template + system: + type: string + description: The model system message/prompt + details: + $ref: '#/components/schemas/ModelDetails' + messages: + type: array + items: + $ref: '#/components/schemas/Message' + + CopyRequest: + type: object + description: Request to copy a model + properties: + source: + type: string + destination: + type: string + + DeleteRequest: + type: object + description: Request to delete a model + properties: + model: + type: string + description: The name of the model to delete + required: + - model + + PullRequest: + type: object + description: Request to pull a model + properties: + model: + type: string + description: The name of the model to pull + example: llama3 + insecure: + type: boolean + description: | + allow insecure connections to the library. Only use this if you are + pulling from your own library during development. 
+      responses:
+        '200':
+          description: The embeddings were successfully generated
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EmbeddingResponse'
+
+  /api/ps:
+    get:
+      operationId: getRunningModels
+      tags:
+        - models
+      description: List running models
+      summary: List running models
+      responses:
+        '200':
+          description: The list of running models was successfully fetched
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ProcessResponse'
+
+  /api/version:
+    get:
+      operationId: getOllamaVersion
+      tags:
+        - server
+      description: Return the Ollama server version
+      summary: Return the Ollama server version
+      responses:
+        '200':
+          description: The Ollama server version was successfully fetched
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  version:
+                    type: string
+
+components:
+  schemas:
+    GenerateRequest:
+      type: object
+      description: Request to generate a response
+      properties:
+        model:
+          type: string
+          description: The model name
+        prompt:
+          type: string
+          description: The prompt to generate a response for
+        images:
+          type: array
+          items:
+            type: string
+            format: byte
+          description: |
+            A list of base64-encoded images (for multimodal models such as
+            llava)
+        format:
+          type: string
+          description: |
+            The format to return a response in. Currently the only accepted
+            value is json
+        options:
+          $ref: '#/components/schemas/Options'
+        system:
+          type: string
+          description: |
+            The system message to use (overrides what is defined in the
+            Modelfile)
+        template:
+          type: string
+          description: |
+            The prompt template to use (overrides what is defined in the
+            Modelfile)
+        context:
+          type: array
+          items:
+            type: integer
+          description: |
+            The context parameter returned from a previous request to
+            generate; this can be used to keep a short conversational memory
+          example: []
+        stream:
+          type: boolean
+          description: |
+            If false the response will be returned as a single response
+            object, rather than a stream of objects
+        raw:
+          type: boolean
+          description: |
+            If true no formatting will be applied to the prompt. You may
+            choose to use the raw parameter if you are specifying a full
+            templated prompt in your request to the API
+        keep_alive:
+          $ref: '#/components/schemas/Duration'
+      required:
+        - model
+        - prompt
+
+    GenerateResponse:
+      type: object
+      description: Response from a generate request
+      properties:
+        model:
+          type: string
+          description: The model name that generated the response
+        created_at:
+          type: string
+          format: date-time
+          description: Timestamp of the response
+        response:
+          type: string
+          description: |
+            The textual response. For streamed responses this holds the next
+            fragment and is empty in the final (done) object; for
+            non-streamed responses it contains the full response
+        done:
+          type: boolean
+          description: Specifies if the response is complete
+        context:
+          type: array
+          items:
+            type: integer
+          description: |
+            When done, an encoding of the conversation used in this response
+        total_duration:
+          type: number
+          description: |
+            When done, time spent in nanoseconds generating the response
+        load_duration:
+          type: number
+          description: When done, time spent in nanoseconds loading the model
+        prompt_eval_count:
+          type: integer
+          description: When done, number of tokens in the prompt
+        prompt_eval_duration:
+          type: number
+          description: |
+            When done, time spent in nanoseconds evaluating the prompt
+        eval_count:
+          type: integer
+          description: When done, number of tokens in the response
+        eval_duration:
+          type: number
+          description: |
+            When done, time in nanoseconds spent generating the response
+
+    ChatRequest:
+      type: object
+      description: Request to generate a response in a chat
+      properties:
+        model:
+          type: string
+          description: The model name
+        messages:
+          type: array
+          items:
+            $ref: '#/components/schemas/Message'
+          description: Messages of the chat; can be used to keep a chat memory
+        stream:
+          type: boolean
+          description: Enable streaming of the returned response
+        format:
+          type: string
+          description: Format to return the response in (e.g. "json")
+        keep_alive:
+          $ref: '#/components/schemas/Duration'
+        options:
+          $ref: '#/components/schemas/Options'
+
+    ChatResponse:
+      type: object
+      description: Response from a chat request
+      properties:
+        model:
+          type: string
+          description: The model name
+        created_at:
+          type: string
+          format: date-time
+          description: Timestamp of the response
+        message:
+          $ref: '#/components/schemas/Message'
+        done_reason:
+          type: string
+          description: Reason the model stopped generating text
+        done:
+          type: boolean
+          description: Specifies if the response is complete
+        total_duration:
+          type: number
+          description: Total duration of the request in nanoseconds
+        load_duration:
+          type: number
+          description: Time spent in nanoseconds loading the model
+        prompt_eval_count:
+          type: integer
+          description: Number of tokens in the prompt
+        prompt_eval_duration:
+          type: number
+          description: Time spent in nanoseconds evaluating the prompt
+        eval_count:
+          type: integer
+          description: Number of tokens in the response
+        eval_duration:
+          type: number
+          description: Time in nanoseconds spent generating the response
+
+    CreateRequest:
+      type: object
+      description: Request to create a model
+      properties:
+        model:
+          type: string
+          description: The name of the model to create
+          example: mario
+        path:
+          type: string
+          description: The path to the Modelfile
+        modelfile:
+          type: string
+          description: The modelfile content
+          example: "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
+        stream:
+          type: boolean
+          description: |
+            If false the response will be returned as a single response
+            object, rather than a stream of objects
+        quantize:
+          type: string
+          description: Specifies the quantization level of the model
+      required:
+        - model
+
+    ListResponse:
+      type: object
+      description: Response from a list request
+      properties:
+        models:
+          type: array
+          items:
+            $ref: '#/components/schemas/ListModelResponse'
+
+    ListModelResponse:
+      type: object
+      description: A single model description in a list response
+      properties:
+        name:
+          type: string
+        model:
+          type: string
+        modified_at:
+          type: string
+          format: date-time
+        size:
+          type: integer
+        digest:
+          type: string
+        details:
+          $ref: '#/components/schemas/ModelDetails'
+
+    ShowRequest:
+      type: object
+      description: Request to show a model
+      properties:
+        model:
+          type: string
+          description: The name of the model to show
+      required:
+        - model
+
+    ShowResponse:
+      type: object
+      description: Response from a show request
+      properties:
+        license:
+          type: string
+          description: The model license
+        modelfile:
+          type: string
+          description: The modelfile content
+        parameters:
+          type: string
+          description: The model parameters
+        template:
+          type: string
+          description: The model template
+        system:
+          type: string
+          description: The model system message/prompt
+        details:
+          $ref: '#/components/schemas/ModelDetails'
+        messages:
+          type: array
+          items:
+            $ref: '#/components/schemas/Message'
+
+    CopyRequest:
+      type: object
+      description: Request to copy a model
+      properties:
+        source:
+          type: string
+        destination:
+          type: string
+
+    DeleteRequest:
+      type: object
+      description: Request to delete a model
+      properties:
+        model:
+          type: string
+          description: The name of the model to delete
+      required:
+        - model
+
+    PullRequest:
+      type: object
+      description: Request to pull a model
+      properties:
+        model:
+          type: string
+          description: The name of the model to pull
+          example: llama3
+        insecure:
+          type: boolean
+          description: |
+            Whether to allow insecure connections to the library. Only use
+            this if you are pulling from your own library during development
+        stream:
+          type: boolean
+          description: |
+            If false the response will be returned as a single response
+            object, rather than a stream of objects
+      required:
+        - model
+
+    PushRequest:
+      type: object
+      description: Request to push a model
+      properties:
+        model:
+          type: string
+          description: |
+            The name of the model to push, in the form namespace/model:tag
+        insecure:
+          type: boolean
+          description: |
+            Whether to allow insecure connections to the library. Only use
+            this if you are pushing to your library during development
+        stream:
+          type: boolean
+          description: |
+            If false the response will be returned as a single response
+            object, rather than a stream of objects
+      required:
+        - model
+
+    ProgressResponse:
+      type: object
+      description: The response returned from various streaming endpoints
+      properties:
+        status:
+          type: string
+          description: The status of the request
+        digest:
+          type: string
+          description: The SHA256 digest of the blob
+        total:
+          type: integer
+          description: The total size of the task
+        completed:
+          type: integer
+          description: The completed size of the task
+
+    EmbeddingRequest:
+      type: object
+      description: Request to generate embeddings
+      properties:
+        model:
+          type: string
+          description: The name of the model to generate embeddings from
+        prompt:
+          type: string
+          description: The text to generate embeddings for
+        keep_alive:
+          $ref: '#/components/schemas/Duration'
+        options:
+          $ref: '#/components/schemas/Options'
+      required:
+        - model
+        - prompt
+
+    EmbeddingResponse:
+      type: object
+      description: Response from an embedding request
+      properties:
+        embedding:
+          type: array
+          items:
+            type: number
+          description: The generated embeddings
+
+    ProcessResponse:
+      type: object
+      description: Response with a list of running models
+      properties:
+        models:
+          type: array
+          items:
+            $ref: '#/components/schemas/ProcessModelResponse'
+
+    ProcessModelResponse:
+      type: object
+      description: Description of a running model
+      properties:
+        name:
+          type: string
+        model:
+          type: string
+        size:
+          type: integer
+        digest:
+          type: string
+        details:
+          $ref: '#/components/schemas/ModelDetails'
+        expires_at:
+          type: string
+          format: date-time
+        size_vram:
+          type: integer
+
+    Message:
+      type: object
+      description: A message in a chat
+      properties:
+        role:
+          type: string
+        content:
+          type: string
+        images:
+          type: array
+          items:
+            type: string
+            format: byte
+
+    ModelDetails:
+      type: object
+      description: Details about a model
+      properties:
+        parent_model:
+          type: string
+        format:
+          type: string
+        family:
+          type: string
+        families:
+          type: array
+          items:
+            type: string
+        parameter_size:
+          type: string
+        quantization_level:
+          type: string
+
+    Duration:
+      type: string
+      description: A string representing the duration
+      example: "5m"
+
+    Options:
+      type: object
+      description: |
+        Advanced model and runner options for generation and chat requests
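+      # Non-normative combined example (an editorial sketch); the values
+      # mirror the per-property defaults and examples documented below.
+      examples:
+        - temperature: 0.8
+          top_k: 40
+          top_p: 0.9
+          num_ctx: 2048
+          stop: ['AI assistant.']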
+      properties:
+        num_keep:
+          type: integer
+          description: |
+            Specifies the number of tokens from the beginning of the context
+            to retain when the context limit is reached.
+            (Default: 4)
+          example: 4
+        seed:
+          type: integer
+          description: |
+            Sets the random number seed to use for generation. Setting this
+            to a specific number will make the model generate the same text
+            for the same prompt.
+            (Default: 0)
+          example: -1
+        num_predict:
+          type: integer
+          description: |
+            Maximum number of tokens to predict when generating text.
+            (Default: 128, -1 = infinite generation, -2 = fill context)
+          example: -1
+        top_k:
+          type: integer
+          description: |
+            Reduces the probability of generating nonsense. A higher value
+            (e.g. 100) will give more diverse answers, while a lower value
+            (e.g. 10) will be more conservative.
+            (Default: 40)
+          example: 40
+        top_p:
+          type: number
+          format: float
+          description: |
+            Works together with top-k. A higher value (e.g., 0.95) will lead
+            to more diverse text, while a lower value (e.g., 0.5) will
+            generate more focused and conservative text.
+            (Default: 0.9)
+          example: 0.9
+        tfs_z:
+          type: number
+          format: float
+          description: |
+            Tail free sampling is used to reduce the impact of less probable
+            tokens from the output. A higher value (e.g., 2.0) will reduce
+            the impact more, while a value of 1.0 disables this setting.
+            (Default: 1.0)
+          example: 1.0
+        typical_p:
+          type: number
+          format: float
+          description: |
+            Controls the selection of typical words based on their
+            probability distribution. A higher value (e.g., 0.95) focuses on
+            more typical words, reducing the chance of unusual words being
+            selected.
+            (Default: 1.0)
+          example: 1.0
+        repeat_last_n:
+          type: integer
+          description: |
+            Sets how far back the model looks to prevent repetition.
+            (Default: 64, 0 = disabled, -1 = num_ctx)
+          example: 64
+        temperature:
+          type: number
+          format: float
+          description: |
+            The temperature of the model. Increasing the temperature will
+            make the model answer more creatively.
+            (Default: 0.8)
+          example: 0.8
+        repeat_penalty:
+          type: number
+          format: float
+          description: |
+            Sets how strongly to penalize repetitions. A higher value
+            (e.g., 1.5) will penalize repetitions more strongly, while a
+            lower value (e.g., 0.9) will be more lenient.
+            (Default: 1.1)
+          example: 1.1
+        presence_penalty:
+          type: number
+          format: float
+          description: |
+            Applies a penalty to tokens that have already appeared in the
+            generated text, encouraging the model to introduce new tokens. A
+            higher value increases this penalty, promoting more varied and
+            less repetitive output.
+            (Default: 0.8)
+          example: 0.8
+        frequency_penalty:
+          type: number
+          format: float
+          description: |
+            Penalizes tokens based on their frequency in the generated text
+            so far. A higher value reduces the likelihood of frequent tokens
+            being generated again, promoting more diverse outputs.
+            (Default: 0.8)
+          example: 0.8
+        mirostat:
+          type: number
+          format: float
+          description: |
+            Enable Mirostat sampling for controlling perplexity.
+            (Default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
+          example: 0
+        mirostat_tau:
+          type: number
+          format: float
+          description: |
+            Controls the balance between coherence and diversity of the
+            output. A lower value will result in more focused and coherent
+            text.
+            (Default: 5.0)
+          example: 5.0
+        mirostat_eta:
+          type: number
+          format: float
+          description: |
+            Influences how quickly the algorithm responds to feedback from
+            the generated text. A lower learning rate will result in slower
+            adjustments, while a higher learning rate will make the
+            algorithm more responsive.
+            (Default: 0.1)
+          example: 0.1
+        penalize_newline:
+          type: boolean
+          description: |
+            Determines whether the model should penalize the generation of
+            newlines, which can help control the structure and formatting of
+            the output.
+            (Default: true)
+          example: true
+        stop:
+          type: array
+          items:
+            type: string
+          description: |
+            Sets the stop sequences to use. When this pattern is encountered
+            the LLM will stop generating text and return. Multiple stop
+            patterns may be set by specifying multiple separate stop
+            parameters in a modelfile.
+          example: ['AI assistant.']
+        numa:
+          type: boolean
+          description: |
+            Indicates whether to use Non-Uniform Memory Access (NUMA) for
+            optimizing memory usage and performance on multi-processor
+            systems.
+            (Default: false)
+          example: false
+        num_ctx:
+          type: integer
+          description: |
+            Sets the size of the context window used to generate the next
+            token.
+            (Default: 2048)
+          example: 2048
+        num_batch:
+          type: integer
+          description: |
+            Specifies the number of batches for processing.
+            (Default: 512)
+          example: 512
+        num_gpu:
+          type: integer
+          description: |
+            Specifies the number of GPUs to use. A value of -1 uses all
+            available GPUs.
+            (Default: -1)
+          example: -1
+        main_gpu:
+          type: integer
+          description: |
+            Specifies the primary GPU to use for processing.
+            (Default: 0)
+        low_vram:
+          type: boolean
+          description: |
+            Indicates whether to optimize the model for low VRAM usage.
+            (Default: false)
+          example: false
+        f16_kv:
+          type: boolean
+          description: |
+            Indicates whether to use 16-bit floating point precision for
+            key-value pairs, reducing memory usage.
+            (Default: true)
+          example: true
+        logits_all:
+          type: boolean
+          description: |
+            Specifies whether to output logits for all tokens.
+            (Default: false)
+          example: false
+        vocab_only:
+          type: boolean
+          description: |
+            Indicates whether to only load the vocabulary without the full
+            model.
+            (Default: false)
+          example: false
+        use_mmap:
+          type: boolean
+          description: |
+            Determines whether to use memory-mapped files for loading the
+            model, improving performance on large models.
+            (Default: true)
+          example: true
+        use_mlock:
+          type: boolean
+          description: |
+            Determines whether to use memory locking to prevent swapping the
+            model out of RAM.
+            (Default: false)
+          example: false
+        num_thread:
+          type: integer
+          description: |
+            Specifies the number of threads to use for processing. A value
+            of 0 uses all available threads.
+            (Default: 0)
+          example: 0