
openapi: 3.1.0
info:
title: Ollama API
description: API for interacting with the Ollama service.
version: 0.1.44
contact:
name: Ollama
url: https://github.com/ollama/ollama
servers:
- url: http://{host}:{port}
description: Ollama API server
variables:
host:
default: 127.0.0.1
port:
default: '11434'
tags:
- name: generate
description: Generate responses
- name: chat
description: Generate chat responses
- name: models
description: Manage models
- name: blobs
description: Manage blobs
- name: embeddings
description: Generate embeddings
- name: server
description: Server information
paths:
/api/generate:
post:
operationId: generateResponse
tags:
- generate
description: |
Generate a response for a given prompt with a provided model. This is
a streaming endpoint, so there will be a series of responses. The
final response object will include statistics and additional data from
the request.
      summary: Generate a response for a given prompt with a provided model
requestBody:
required: true
description: Request to generate a response
content:
application/json:
schema:
$ref: '#/components/schemas/GenerateRequest'
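            # Illustrative request body (a sketch, not taken from the spec;
            # the model name is a placeholder for any locally installed model):
            example:
              model: llama3
              prompt: Why is the sky blue?
              stream: false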
responses:
'200':
description: A response was successfully generated for the prompt
content:
application/json:
schema:
$ref: '#/components/schemas/GenerateResponse'
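              # Illustrative final (done) response object; all values are
              # placeholders, and the duration fields are large nanosecond
              # integers:
              example:
                model: llama3
                created_at: '2024-06-01T12:00:00Z'
                response: The sky is blue because of Rayleigh scattering.
                done: true
                context: [1, 2, 3]
                total_duration: 5043500667
                load_duration: 5025959
                prompt_eval_count: 26
                prompt_eval_duration: 325953000
                eval_count: 290
                eval_duration: 4709213000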
/api/chat:
post:
operationId: generateChat
tags:
- chat
- generate
description: |
Generate the next message in a chat with a provided model. This is a
streaming endpoint, so there will be a series of responses. Streaming
can be disabled using "stream": false. The final response object will
include statistics and additional data from the request.
      summary: Generate the next message in a chat with a provided model
requestBody:
required: true
description: Request to generate a response in a chat
content:
application/json:
schema:
$ref: '#/components/schemas/ChatRequest'
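            # Illustrative request body (a sketch; the model name is a
            # placeholder for any locally installed model):
            example:
              model: llama3
              messages:
                - role: user
                  content: Why is the sky blue?
              stream: false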
responses:
'200':
description: The next message was successfully generated for the chat
content:
application/json:
schema:
$ref: '#/components/schemas/ChatResponse'
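              # Illustrative final (done) response object with placeholder
              # values; the duration fields are nanosecond integers:
              example:
                model: llama3
                created_at: '2024-06-01T12:00:00Z'
                message:
                  role: assistant
                  content: The sky is blue because of Rayleigh scattering.
                done_reason: stop
                done: true
                total_duration: 5043500667
                load_duration: 5025959
                prompt_eval_count: 26
                prompt_eval_duration: 325953000
                eval_count: 290
                eval_duration: 4709213000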
/api/create:
post:
operationId: createModel
tags:
- models
      description: |
        Create a model from a Modelfile. It is recommended to set modelfile
        to the content of the Modelfile rather than just setting path; this
        is a requirement for remote create. Remote model creation must also
        create any file blobs (fields such as FROM and ADAPTER) explicitly
        with the server using Create a Blob, setting the field value to the
        path indicated in the response.
      summary: Create a model from a Modelfile
requestBody:
required: true
description: Request to create a model
content:
application/json:
schema:
$ref: '#/components/schemas/CreateRequest'
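            # Illustrative request body, reusing the example values from the
            # CreateRequest schema below:
            example:
              model: mario
              modelfile: "FROM llama3\nSYSTEM You are mario from Super Mario Bros."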
responses:
'200':
description: The model was successfully created
content:
application/x-ndjson:
schema:
$ref: '#/components/schemas/ProgressResponse'
/api/blobs/{digest}:
get:
operationId: getBlob
tags:
- blobs
      description: |
        Ensures that the file blob used for a FROM or ADAPTER field exists
        on the server. This checks your Ollama server, not Ollama.ai.
      summary: Check that a file blob exists on the server
parameters:
- name: digest
in: path
required: true
description: The SHA256 digest of the blob
schema:
type: string
responses:
'200':
description: The blob exists on the server
'404':
description: The blob does not exist on the server
post:
operationId: createBlob
tags:
- blobs
description: Create a blob from a file on the server
summary: Create a blob from a file on the server
parameters:
- name: digest
in: path
required: true
description: The SHA256 digest of the blob
schema:
type: string
requestBody:
required: true
description: The file to create the blob from
content:
application/octet-stream:
schema:
type: string
format: binary
responses:
'201':
description: Blob was successfully created
'400':
description: The digest used is not expected
/api/tags:
get:
operationId: getModels
tags:
- models
description: List models that are available locally
summary: List models that are available locally
responses:
'200':
description: The models were successfully fetched
content:
application/json:
schema:
$ref: '#/components/schemas/ListResponse'
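              # Illustrative response with a single placeholder model entry:
              example:
                models:
                  - name: 'llama3:latest'
                    model: 'llama3:latest'
                    modified_at: '2024-06-01T12:00:00Z'
                    size: 4661224676
                    digest: 'sha256:placeholderdigest'
                    details:
                      format: gguf
                      family: llama
                      families:
                        - llama
                      parameter_size: '8B'
                      quantization_level: Q4_0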
/api/show:
post:
operationId: showModel
tags:
- models
description: |
Show information about a model including details, modelfile, template,
parameters, license, and system prompt.
      summary: Show information about a model
requestBody:
required: true
description: Request to show a model
content:
application/json:
schema:
$ref: '#/components/schemas/ShowRequest'
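            # Illustrative request body (the model name is a placeholder):
            example:
              model: llama3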
responses:
'200':
description: The model's information was successfully fetched
content:
application/json:
schema:
$ref: '#/components/schemas/ShowResponse'
/api/copy:
post:
operationId: copyModel
tags:
- models
description: |
Copy a model. Creates a model with another name from an existing model.
      summary: Copy a model
requestBody:
required: true
description: Request to copy a model
content:
application/json:
schema:
$ref: '#/components/schemas/CopyRequest'
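            # Illustrative request body (both names are placeholders):
            example:
              source: llama3
              destination: llama3-backup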
responses:
'200':
description: Model was successfully copied
'404':
description: Source model does not exist
/api/delete:
delete:
operationId: deleteModel
tags:
- models
description: Delete a model and its data
      summary: Delete a model and its data
requestBody:
required: true
description: Request to delete a model
content:
application/json:
schema:
$ref: '#/components/schemas/DeleteRequest'
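            # Illustrative request body (the model name is a placeholder):
            example:
              model: llama3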
responses:
'200':
description: Model was successfully deleted
'404':
description: Model does not exist
/api/pull:
post:
operationId: pullModel
tags:
- models
description: |
Download a model from the ollama library. Cancelled pulls are resumed
from where they left off, and multiple calls will share the same
download progress.
      summary: Download a model from the ollama library
requestBody:
required: true
description: Request to pull a model
content:
application/json:
schema:
$ref: '#/components/schemas/PullRequest'
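            # Illustrative request body, reusing the example model name from
            # the PullRequest schema below:
            example:
              model: llama3
              stream: false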
responses:
'200':
description: Model was successfully pulled to the server
content:
application/x-ndjson:
schema:
$ref: '#/components/schemas/ProgressResponse'
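              # Illustrative single object from the progress stream; the
              # digest and byte counts are placeholders:
              example:
                status: downloading layer
                digest: 'sha256:placeholderdigest'
                total: 2142590208
                completed: 241970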
/api/push:
post:
operationId: pushModel
tags:
- models
description: |
Upload a model to a model library. Requires registering for ollama.ai
and adding a public key first.
      summary: Upload a model to a model library
requestBody:
required: true
description: Request to push a model
content:
application/json:
schema:
$ref: '#/components/schemas/PushRequest'
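            # Illustrative request body; the namespace, model, and tag are
            # placeholders:
            example:
              model: 'mynamespace/mymodel:latest'
              stream: false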
responses:
'200':
description: Model was successfully pushed to the server
content:
application/x-ndjson:
schema:
$ref: '#/components/schemas/ProgressResponse'
/api/embeddings:
post:
operationId: generateEmbeddings
tags:
- embeddings
- generate
description: Generate embeddings from a model
summary: Generate embeddings from a model
requestBody:
required: true
description: Request to generate embeddings
content:
application/json:
schema:
$ref: '#/components/schemas/EmbeddingRequest'
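            # Illustrative request body (the model name and prompt are
            # placeholders):
            example:
              model: all-minilm
              prompt: Here is an article about llamas...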
responses:
'200':
description: The embeddings were successfully generated
content:
application/json:
schema:
$ref: '#/components/schemas/EmbeddingResponse'
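              # Illustrative response, truncated; a real embedding has one
              # entry per dimension of the model's embedding space:
              example:
                embedding: [0.567, -0.811, 0.133]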
/api/ps:
get:
operationId: getRunningModels
tags:
- models
description: List running models
summary: List running models
responses:
'200':
description: The list of running models was successfully fetched
content:
application/json:
schema:
$ref: '#/components/schemas/ProcessResponse'
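              # Illustrative response with one placeholder running model:
              example:
                models:
                  - name: 'llama3:latest'
                    model: 'llama3:latest'
                    size: 6654289920
                    digest: 'sha256:placeholderdigest'
                    expires_at: '2024-06-01T12:05:00Z'
                    size_vram: 6654289920
                    details:
                      format: gguf
                      family: llama
                      parameter_size: '8B'
                      quantization_level: Q4_0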
/api/version:
get:
operationId: getOllamaVersion
tags:
- server
description: Return the Ollama server version
summary: Return the Ollama server version
responses:
'200':
description: The Ollama server version was successfully fetched
content:
application/json:
schema:
type: object
properties:
version:
type: string
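              # Illustrative response; the value mirrors this spec's
              # info.version and is a placeholder for the running server's
              # actual version:
              example:
                version: '0.1.44'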
components:
schemas:
GenerateRequest:
type: object
description: Request to generate a response
properties:
model:
type: string
description: The model name
prompt:
type: string
description: The prompt to generate a response for
images:
type: array
items:
type: string
format: byte
description: |
A list of base64-encoded images (for multimodal models such as
llava)
format:
type: string
description: |
The format to return a response in. Currently the only accepted
value is json
options:
$ref: '#/components/schemas/Options'
system:
type: string
          description: |
            The system message to use (overrides what is defined in the
            Modelfile)
template:
type: string
description: |
The prompt template to use (overrides what is defined in the
Modelfile)
context:
type: array
items:
type: integer
          description: |
            The context parameter returned from a previous request to
            /api/generate; this can be used to keep a short conversational
            memory
example: []
stream:
type: boolean
description: |
If false the response will be returned as a single response object,
rather than a stream of objects
raw:
type: boolean
description: |
If true no formatting will be applied to the prompt. You may choose
to use the raw parameter if you are specifying a full templated
prompt in your request to the API
keep_alive:
$ref: '#/components/schemas/Duration'
required:
- model
- prompt
GenerateResponse:
type: object
description: Response from a generate request
properties:
model:
type: string
description: The model name that generated the response
created_at:
type: string
format: date-time
description: Timestamp of the response
response:
type: string
          description: |
            The textual response. When streaming, each object carries a
            fragment of the response and the final (done) object is empty;
            when not streaming, this contains the full response
done:
type: boolean
description: Specifies if the response is complete
context:
type: array
items:
type: integer
          description: |
            When done, an encoding of the conversation used in this
            response; this can be sent in the next request to keep a
            conversational memory
total_duration:
          type: integer
          format: int64
          description: When done, time spent in nanoseconds generating the response
load_duration:
          type: integer
          format: int64
description: When done, time spent in nanoseconds loading the model
prompt_eval_count:
type: integer
description: When done, number of tokens in the prompt
prompt_eval_duration:
          type: integer
          format: int64
description: |
When done, time spent in nanoseconds evaluating the prompt
eval_count:
type: integer
description: When done, number of tokens in the response
eval_duration:
          type: integer
          format: int64
description: |
When done, time in nanoseconds spent generating the response
ChatRequest:
type: object
description: Request to generate a response in a chat
properties:
model:
type: string
description: The model name
messages:
type: array
items:
$ref: '#/components/schemas/Message'
description: Messages of the chat - can be used to keep a chat memory
stream:
type: boolean
description: Enable streaming of returned response
format:
type: string
description: Format to return the response in (e.g. "json")
keep_alive:
$ref: '#/components/schemas/Duration'
options:
$ref: '#/components/schemas/Options'
ChatResponse:
type: object
description: Response from a chat request
properties:
model:
type: string
description: The model name
created_at:
type: string
format: date-time
description: Timestamp of the response
message:
$ref: '#/components/schemas/Message'
done_reason:
type: string
description: Reason the model stopped generating text
done:
type: boolean
description: Specifies if the response is complete
total_duration:
          type: integer
          format: int64
          description: When done, total time spent in nanoseconds on the request
        load_duration:
          type: integer
          format: int64
          description: When done, time spent in nanoseconds loading the model
        prompt_eval_count:
          type: integer
          description: When done, number of tokens in the prompt
        prompt_eval_duration:
          type: integer
          format: int64
          description: When done, time spent in nanoseconds evaluating the prompt
        eval_count:
          type: integer
          description: When done, number of tokens in the response
        eval_duration:
          type: integer
          format: int64
          description: When done, time spent in nanoseconds generating the response
CreateRequest:
type: object
description: Request to create a model
properties:
model:
type: string
description: The name of the model to create
example: mario
path:
type: string
description: The path to the model file
modelfile:
type: string
description: The modelfile content
          example: "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
stream:
type: boolean
description: |
If false the response will be returned as a single response object,
rather than a stream of objects
quantize:
type: string
description: Specifies the quantization level of the model
required:
- model
ListResponse:
type: object
description: Response from a list request
properties:
models:
type: array
items:
$ref: '#/components/schemas/ListModelResponse'
ListModelResponse:
type: object
      description: A single model description within a list response
properties:
name:
type: string
model:
type: string
modified_at:
type: string
format: date-time
size:
type: integer
digest:
type: string
details:
$ref: '#/components/schemas/ModelDetails'
ShowRequest:
type: object
description: Request to show a model
properties:
model:
type: string
description: The name of the model to show
required:
- model
ShowResponse:
type: object
description: Response from a show request
properties:
license:
type: string
description: The model license
modelfile:
type: string
description: The modelfile content
parameters:
type: string
description: The model parameters
template:
type: string
description: The model template
system:
type: string
description: The model system message/prompt
details:
$ref: '#/components/schemas/ModelDetails'
messages:
type: array
items:
$ref: '#/components/schemas/Message'
CopyRequest:
type: object
description: Request to copy a model
properties:
source:
type: string
destination:
type: string
DeleteRequest:
type: object
description: Request to delete a model
properties:
model:
type: string
description: The name of the model to delete
required:
- model
PullRequest:
type: object
description: Request to pull a model
properties:
model:
type: string
description: The name of the model to pull
example: llama3
insecure:
type: boolean
          description: |
            Whether to allow insecure connections to the library. Only use
            this if you are pulling from your own library during development
stream:
type: boolean
description: |
If false the response will be returned as a single response object,
rather than a stream of objects
required:
- model
PushRequest:
type: object
description: Request to push a model
properties:
model:
type: string
description: |
The name of the model to push in the form of <namespace>/<model>:<tag>
insecure:
type: boolean
description: |
Whether to allow insecure connections to the library. Only use this
if you are pushing to your library during development
stream:
type: boolean
description: |
If false the response will be returned as a single response object,
rather than a stream of objects
required:
- model
ProgressResponse:
type: object
description: The response returned from various streaming endpoints
properties:
status:
type: string
description: The status of the request
digest:
type: string
description: The SHA256 digest of the blob
total:
type: integer
description: The total size of the task
completed:
type: integer
description: The completed size of the task
EmbeddingRequest:
type: object
description: Request to generate embeddings
properties:
model:
type: string
          description: The name of the model to generate embeddings from
prompt:
type: string
description: The text to generate embeddings for
keep_alive:
$ref: '#/components/schemas/Duration'
options:
$ref: '#/components/schemas/Options'
required:
- model
- prompt
EmbeddingResponse:
type: object
description: Response from an embedding request
properties:
embedding:
type: array
items:
type: number
description: The generated embeddings
ProcessResponse:
type: object
description: Response with a list of running models
properties:
models:
type: array
items:
$ref: '#/components/schemas/ProcessModelResponse'
ProcessModelResponse:
type: object
description: Running model description
properties:
name:
type: string
model:
type: string
size:
type: integer
digest:
type: string
details:
$ref: '#/components/schemas/ModelDetails'
expires_at:
type: string
format: date-time
size_vram:
type: integer
Message:
type: object
description: A message in a chat
properties:
role:
type: string
content:
type: string
images:
type: array
items:
type: string
format: byte
ModelDetails:
type: object
description: Details about a model
properties:
parent_model:
type: string
format:
type: string
family:
type: string
families:
type: array
items:
type: string
parameter_size:
type: string
quantization_level:
type: string
Duration:
type: string
      description: |
        A duration string, such as "5m" or "30s", controlling how long the
        model will stay loaded in memory following the request
example: "5m"
Options:
type: object
description: |
Advanced model and runner options for generation and chat requests
properties:
num_keep:
type: integer
          description: |
            Specifies the number of tokens from the beginning of the
            context to retain when the context limit is reached.
            (Default: 4)
example: 4
seed:
type: integer
description: |
Sets the random number seed to use for generation. Setting this to
a specific number will make the model generate the same text for
the same prompt.
(Default: 0)
example: -1
num_predict:
type: integer
description: |
Maximum number of tokens to predict when generating text.
(Default: 128, -1 = infinite generation, -2 = fill context)
example: -1
top_k:
type: integer
description: |
Reduces the probability of generating nonsense. A higher value
(e.g. 100) will give more diverse answers, while a lower value
(e.g. 10) will be more conservative.
(Default: 40)
example: 40
top_p:
type: number
format: float
description: |
Works together with top-k. A higher value (e.g., 0.95) will lead to
more diverse text, while a lower value (e.g., 0.5) will generate
more focused and conservative text.
(Default: 0.9)
example: 0.9
tfs_z:
type: number
format: float
description: |
Tail free sampling is used to reduce the impact of less probable
tokens from the output. A higher value (e.g., 2.0) will reduce the
impact more, while a value of 1.0 disables this setting.
            (Default: 1)
example: 1.0
typical_p:
type: number
format: float
description: |
Controls the selection of typical words based on their probability
distribution. A higher value (e.g., 0.95) focuses on more typical
words, reducing the chance of unusual words being selected.
(Default: 1.0)
example: 1.0
repeat_last_n:
type: integer
          description: |
            Sets how far back the model looks to prevent repetition.
            (Default: 64, 0 = disabled, -1 = num_ctx)
example: 64
temperature:
type: number
format: float
description: |
The temperature of the model. Increasing the temperature will make
the model answer more creatively.
(Default: 0.8)
example: 0.8
repeat_penalty:
type: number
format: float
description: |
Sets how strongly to penalize repetitions. A higher value
(e.g., 1.5) will penalize repetitions more strongly, while a lower
value (e.g., 0.9) will be more lenient.
(Default: 1.1)
example: 1.1
presence_penalty:
type: number
format: float
description: |
Applies a penalty to tokens that have already appeared in the
generated text, encouraging the model to introduce new tokens. A
higher value increases this penalty, promoting more varied and less
repetitive output.
(Default: 0.8)
example: 0.8
frequency_penalty:
type: number
format: float
description: |
Penalizes tokens based on their frequency in the generated text so
far. A higher value reduces the likelihood of frequent tokens being
generated again, promoting more diverse outputs.
(Default: 0.8)
example: 0.8
mirostat:
          type: integer
description: |
Enable Mirostat sampling for controlling perplexity.
            (Default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
example: 0
mirostat_tau:
type: number
format: float
description: |
Controls the balance between coherence and diversity of the output.
A lower value will result in more focused and coherent text.
(Default: 5.0)
          example: 5.0
mirostat_eta:
type: number
format: float
description: |
Influences how quickly the algorithm responds to feedback from the
generated text. A lower learning rate will result in slower
adjustments, while a higher learning rate will make the algorithm
more responsive.
(Default: 0.1)
example: 0.1
penalize_newline:
type: boolean
description: |
Determines whether the model should penalize the generation of
newlines, which can help control the structure and formatting of
the output.
(Default: true)
example: true
stop:
type: array
items:
type: string
description: |
Sets the stop sequences to use. When this pattern is encountered
the LLM will stop generating text and return. Multiple stop patterns
may be set by specifying multiple separate stop parameters in a
modelfile.
example: ['AI assistant.']
numa:
type: boolean
description: |
Indicates whether to use Non-Uniform Memory Access (NUMA) for
optimizing memory usage and performance on multi-processor systems.
(Default: false)
example: false
num_ctx:
type: integer
description: |
Sets the size of the context window used to generate the next token.
(Default: 2048)
example: 2048
num_batch:
type: integer
description: |
Specifies the number of batches for processing.
(Default: 512)
example: 512
num_gpu:
type: integer
description: |
Specifies the number of GPUs to use. A value of -1 uses all
available GPUs.
(Default: -1)
example: -1
main_gpu:
type: integer
description: |
Specifies the primary GPU to use for processing.
(Default: 0)
low_vram:
type: boolean
description: |
Indicates whether to optimize the model for low VRAM usage.
(Default: false)
example: false
f16_kv:
type: boolean
description: |
Indicates whether to use 16-bit floating point precision for
key-value pairs, reducing memory usage.
(Default: false)
example: true
logits_all:
type: boolean
description: |
Specifies whether to output logits for all tokens.
(Default: false)
example: false
vocab_only:
type: boolean
description: |
Indicates whether to only load the vocabulary without the full model.
(Default: false)
example: false
use_mmap:
type: boolean
description: |
Determines whether to use memory-mapped files for loading the model,
improving performance on large models.
(Default: true)
example: true
use_mlock:
type: boolean
description: |
Determines whether to use memory locking to prevent swapping the
model out of RAM.
(Default: false)
example: false
num_thread:
type: integer
description: |
Specifies the number of threads to use for processing. A value of
0 uses all available threads.
(Default: 0)
example: 0