Skip to main content

AI Inference

Package: com.mimik.mimoe.inference

OpenAI-compatible chat completions, streaming, and embeddings.

ChatCompletion

@Serializable
data class ChatCompletion(
val id: String,
val objectType: String = "chat.completion",
val created: Long = 0,
val model: String = "",
val choices: List<ChatCompletion.Choice> = emptyList(),
val usage: ChatCompletion.Usage? = null
)

Response from a chat completion request.

Constructors

constructor(
id: String,
objectType: String = "chat.completion",
created: Long = 0,
model: String = "",
choices: List<ChatCompletion.Choice> = emptyList(),
usage: ChatCompletion.Usage? = null
)

Types

Name | Summary
Choice — @Serializable
data class Choice(val index: Int = 0, val message: ChatMessage, val finishReason: String? = null)
Usage — @Serializable
data class Usage(val promptTokens: Int = 0, val completionTokens: Int = 0, val totalTokens: Int = 0)

Properties

Name | Summary
choices — val choices: List<ChatCompletion.Choice>
created — val created: Long = 0
id — val id: String
model — val model: String
objectType — @SerialName(value = "object")
val objectType: String
usage — val usage: ChatCompletion.Usage? = null

ChatCompletionChunk

@Serializable
data class ChatCompletionChunk(
val id: String,
val objectType: String = "chat.completion.chunk",
val created: Long = 0,
val model: String = "",
val choices: List<ChatCompletionChunk.ChunkChoice> = emptyList()
)

A streaming chunk from a chat completion.

Constructors

constructor(
id: String,
objectType: String = "chat.completion.chunk",
created: Long = 0,
model: String = "",
choices: List<ChatCompletionChunk.ChunkChoice> = emptyList()
)

Types

Name | Summary
ChunkChoice — @Serializable
data class ChunkChoice(val index: Int = 0, val delta: ChatCompletionChunk.Delta, val finishReason: String? = null)
Delta — @Serializable
data class Delta(val role: String? = null, val content: String? = null)

Properties

Name | Summary
choices — val choices: List<ChatCompletionChunk.ChunkChoice>
created — val created: Long = 0
id — val id: String
model — val model: String
objectType — @SerialName(value = "object")
val objectType: String

ChatMessage

@Serializable
data class ChatMessage(val role: String, val content: String)

A message in a chat conversation.

Constructors

constructor(role: String, content: String)

Properties

Name | Summary
content — val content: String
Message content.
role — val role: String
Role of the message sender (system, user, assistant).

EmbeddingData

@Serializable
data class EmbeddingData(
val objectType: String = "embedding",
val embedding: List<Float>,
val index: Int = 0
)

Constructors

constructor(
objectType: String = "embedding",
embedding: List<Float>,
index: Int = 0
)

Properties

Name | Summary
embedding — val embedding: List<Float>
index — val index: Int = 0
objectType — @SerialName(value = "object")
val objectType: String

EmbeddingResponse

@Serializable
data class EmbeddingResponse(
val objectType: String = "list",
val data: List<EmbeddingData> = emptyList(),
val model: String = "",
val usage: EmbeddingUsage? = null
)

Response from an embeddings request.

Constructors

constructor(
objectType: String = "list",
data: List<EmbeddingData> = emptyList(),
model: String = "",
usage: EmbeddingUsage? = null
)

Properties

Name | Summary
data — val data: List<EmbeddingData>
model — val model: String
objectType — @SerialName(value = "object")
val objectType: String
usage — val usage: EmbeddingUsage? = null

EmbeddingUsage

@Serializable
data class EmbeddingUsage(
val promptTokens: Int = 0,
val totalTokens: Int = 0
)

Constructors

constructor(promptTokens: Int = 0, totalTokens: Int = 0)

Properties

Name | Summary
promptTokens — @SerialName(value = "prompt_tokens")
val promptTokens: Int = 0
totalTokens — @SerialName(value = "total_tokens")
val totalTokens: Int = 0

InferenceClient

class InferenceClient

Inference sub-client.

Provides OpenAI-compatible chat completions, streaming, embeddings, and model cache management. Base path: /mimik-ai/openai/v1 Requires the ai-foundation addon.

Access via client.inference.

Functions

chatCompletion

suspend fun chatCompletion(model: String, messages: List<ChatMessage>, temperature: Double? = null, topP: Double? = null, maxTokens: Int? = null): Result<ChatCompletion>

Generate a chat completion.

chatCompletionStream

fun chatCompletionStream(model: String, messages: List<ChatMessage>, temperature: Double? = null, topP: Double? = null, maxTokens: Int? = null): Flow<ChatCompletionChunk>

Stream a chat completion via SSE.

createEmbeddings

suspend fun createEmbeddings(model: String, input: List<String>): Result<EmbeddingResponse>

Generate text embeddings.

listModels

suspend fun listModels(): Result<List<LoadedModel>>

List models currently loaded in the runtime cache.

loadModel

suspend fun loadModel(model: String, chatTemplateHint: String? = null, contextSize: Int? = null, gpuLayerSize: Int? = null): Result<LoadedModel>

Load a model from the Model Registry into runtime cache.

unloadModel

suspend fun unloadModel(modelId: String): Result<Unit>

Unload a model from runtime cache.


LoadedModel

@Serializable
data class LoadedModel(
val id: String,
val objectType: String = "model",
val created: Long = 0,
val ownedBy: String = "",
val info: ModelInfoDetail? = null,
val metrics: ModelMetrics? = null
)

A model loaded in the inference runtime cache.

Returned by GET /mimik-ai/openai/v1/models and POST /mimik-ai/openai/v1/models.

Constructors

constructor(
id: String,
objectType: String = "model",
created: Long = 0,
ownedBy: String = "",
info: ModelInfoDetail? = null,
metrics: ModelMetrics? = null
)

Properties

Name | Summary
created — val created: Long = 0
Creation timestamp (Unix epoch seconds).
id — val id: String
Model identifier.
info — val info: ModelInfoDetail? = null
Model metadata.
metrics — val metrics: ModelMetrics? = null
Runtime performance metrics.
objectType — @SerialName(value = "object")
val objectType: String
Object type (always "model").
ownedBy — @SerialName(value = "owned_by")
val ownedBy: String
Model owner.

ModelInfoDetail

@Serializable
data class ModelInfoDetail(
val kind: String = "",
val chatTemplateHint: String = "",
val nGpuLayers: Int = 0,
val maxContext: Int = 0,
val nVocab: Int = 0,
val nCtxTrain: Int = 0,
val nEmbd: Int = 0,
val nParams: Long = 0,
val modelSize: Long = 0
)

Detailed model metadata.

Constructors

constructor(
kind: String = "",
chatTemplateHint: String = "",
nGpuLayers: Int = 0,
maxContext: Int = 0,
nVocab: Int = 0,
nCtxTrain: Int = 0,
nEmbd: Int = 0,
nParams: Long = 0,
modelSize: Long = 0
)

Properties

Name | Summary
chatTemplateHint — @SerialName(value = "chat_template_hint")
val chatTemplateHint: String
Applied chat template.
kind — val kind: String
Model kind: "llm", "vlm", or "embed".
maxContext — @SerialName(value = "max_context")
val maxContext: Int = 0
Maximum context size.
modelSize — @SerialName(value = "model_size")
val modelSize: Long = 0
Model file size in bytes.
nCtxTrain — @SerialName(value = "n_ctx_train")
val nCtxTrain: Int = 0
Training context length.
nEmbd — @SerialName(value = "n_embd")
val nEmbd: Int = 0
Embedding dimension.
nGpuLayers — @SerialName(value = "n_gpu_layers")
val nGpuLayers: Int = 0
Number of GPU-offloaded layers.
nParams — @SerialName(value = "n_params")
val nParams: Long = 0
Total parameter count.
nVocab — @SerialName(value = "n_vocab")
val nVocab: Int = 0
Vocabulary size.

ModelMetrics

@Serializable
data class ModelMetrics(
val inferenceCount: Int = 0,
val lastUsed: Long = 0,
val loadedAt: Long = 0,
val tokensPerSecond: Double = 0.0,
val avgTokensPerSecond: Double = 0.0,
val lastLatencyMs: Double = 0.0,
val avgLatencyMs: Double = 0.0
)

Runtime performance metrics for a loaded model.

Constructors

constructor(
inferenceCount: Int = 0,
lastUsed: Long = 0,
loadedAt: Long = 0,
tokensPerSecond: Double = 0.0,
avgTokensPerSecond: Double = 0.0,
lastLatencyMs: Double = 0.0,
avgLatencyMs: Double = 0.0
)

Properties

Name | Summary
avgLatencyMs — @SerialName(value = "avg_latency_ms")
val avgLatencyMs: Double = 0.0
Average latency (ms). Embed only.
avgTokensPerSecond — @SerialName(value = "avg_tokens_per_second")
val avgTokensPerSecond: Double = 0.0
Average throughput (tokens/sec). LLM/VLM only.
inferenceCount — @SerialName(value = "inference_count")
val inferenceCount: Int = 0
Total inference calls.
lastLatencyMs — @SerialName(value = "last_latency_ms")
val lastLatencyMs: Double = 0.0
Most recent latency (ms). Embed only.
lastUsed — @SerialName(value = "last_used")
val lastUsed: Long = 0
Most recent usage (Unix epoch seconds).
loadedAt — @SerialName(value = "loaded_at")
val loadedAt: Long = 0
When the model was loaded (Unix epoch seconds).
tokensPerSecond — @SerialName(value = "tokens_per_second")
val tokensPerSecond: Double = 0.0
Most recent throughput (tokens/sec). LLM/VLM only.