AI Inference
Package: com.mimik.mimoe.inference
OpenAI-compatible chat completions, streaming, and embeddings.
ChatCompletion
@Serializable
data class ChatCompletion(
val id: String,
val objectType: String = "chat.completion",
val created: Long = 0,
val model: String = "",
val choices: List<ChatCompletion.Choice> = emptyList(),
val usage: ChatCompletion.Usage? = null
)
Response from a chat completion request.
Constructors
constructor(
id: String,
objectType: String = "chat.completion",
created: Long = 0,
model: String = "",
choices: List<ChatCompletion.Choice> = emptyList(),
usage: ChatCompletion.Usage? = null
)
Types
| Name | Summary |
|---|---|
Choice | @Serializable data class Choice(val index: Int = 0, val message: ChatMessage, val finishReason: String? = null) |
Usage | @Serializable data class Usage(val promptTokens: Int = 0, val completionTokens: Int = 0, val totalTokens: Int = 0) |
Properties
| Name | Summary |
|---|---|
choices | val choices: List<ChatCompletion.Choice> |
created | val created: Long = 0 |
id | val id: String |
model | val model: String |
objectType | @SerialName(value = "object") val objectType: String |
usage | val usage: ChatCompletion.Usage? = null |
ChatCompletionChunk
@Serializable
data class ChatCompletionChunk(
val id: String,
val objectType: String = "chat.completion.chunk",
val created: Long = 0,
val model: String = "",
val choices: List<ChatCompletionChunk.ChunkChoice> = emptyList()
)
A streaming chunk from a chat completion.
Constructors
constructor(
id: String,
objectType: String = "chat.completion.chunk",
created: Long = 0,
model: String = "",
choices: List<ChatCompletionChunk.ChunkChoice> = emptyList()
)
Types
| Name | Summary |
|---|---|
ChunkChoice | @Serializable data class ChunkChoice(val index: Int = 0, val delta: ChatCompletionChunk.Delta, val finishReason: String? = null) |
Delta | @Serializable data class Delta(val role: String? = null, val content: String? = null) |
Properties
| Name | Summary |
|---|---|
choices | val choices: List<ChatCompletionChunk.ChunkChoice> |
created | val created: Long = 0 |
id | val id: String |
model | val model: String |
objectType | @SerialName(value = "object") val objectType: String |
ChatMessage
@Serializable
data class ChatMessage(val role: String, val content: String)
A message in a chat conversation.
Constructors
constructor(role: String, content: String)
Properties
| Name | Summary |
|---|---|
content | val content: String — Message content. |
role | val role: String — Role of the message sender (system, user, assistant). |
EmbeddingData
@Serializable
data class EmbeddingData(
val objectType: String = "embedding",
val embedding: List<Float>,
val index: Int = 0
)
Constructors
constructor(
objectType: String = "embedding",
embedding: List<Float>,
index: Int = 0
)
Properties
| Name | Summary |
|---|---|
embedding | val embedding: List<Float> |
index | val index: Int = 0 |
objectType | @SerialName(value = "object") val objectType: String |
EmbeddingResponse
@Serializable
data class EmbeddingResponse(
val objectType: String = "list",
val data: List<EmbeddingData> = emptyList(),
val model: String = "",
val usage: EmbeddingUsage? = null
)
Response from an embeddings request.
Constructors
constructor(
objectType: String = "list",
data: List<EmbeddingData> = emptyList(),
model: String = "",
usage: EmbeddingUsage? = null
)
Properties
| Name | Summary |
|---|---|
data | val data: List<EmbeddingData> |
model | val model: String |
objectType | @SerialName(value = "object") val objectType: String |
usage | val usage: EmbeddingUsage? = null |
EmbeddingUsage
@Serializable
data class EmbeddingUsage(
val promptTokens: Int = 0,
val totalTokens: Int = 0
)
Constructors
constructor(promptTokens: Int = 0, totalTokens: Int = 0)
Properties
| Name | Summary |
|---|---|
promptTokens | @SerialName(value = "prompt_tokens") val promptTokens: Int = 0 |
totalTokens | @SerialName(value = "total_tokens") val totalTokens: Int = 0 |
InferenceClient
class InferenceClient
Inference sub-client.
Provides OpenAI-compatible chat completions, streaming, embeddings, and model cache management. Base path: /mimik-ai/openai/v1 Requires the ai-foundation addon.
Access via client.inference.
Functions
chatCompletion
suspend fun chatCompletion(model: String, messages: List<ChatMessage>, temperature: Double? = null, topP: Double? = null, maxTokens: Int? = null): Result<ChatCompletion>
Generate a chat completion.
chatCompletionStream
fun chatCompletionStream(model: String, messages: List<ChatMessage>, temperature: Double? = null, topP: Double? = null, maxTokens: Int? = null): Flow<ChatCompletionChunk>
Stream a chat completion via SSE.
createEmbeddings
suspend fun createEmbeddings(model: String, input: List<String>): Result<EmbeddingResponse>
Generate text embeddings.
listModels
suspend fun listModels(): Result<List<LoadedModel>>
List models currently loaded in the runtime cache.
loadModel
suspend fun loadModel(model: String, chatTemplateHint: String? = null, contextSize: Int? = null, gpuLayerSize: Int? = null): Result<LoadedModel>
Load a model from the Model Registry into runtime cache.
unloadModel
suspend fun unloadModel(modelId: String): Result<Unit>
Unload a model from runtime cache.
LoadedModel
@Serializable
data class LoadedModel(
val id: String,
val objectType: String = "model",
val created: Long = 0,
val ownedBy: String = "",
val info: ModelInfoDetail? = null,
val metrics: ModelMetrics? = null
)
A model loaded in the inference runtime cache.
Returned by GET /mimik-ai/openai/v1/models and POST /mimik-ai/openai/v1/models.
Constructors
constructor(
id: String,
objectType: String = "model",
created: Long = 0,
ownedBy: String = "",
info: ModelInfoDetail? = null,
metrics: ModelMetrics? = null
)
Properties
| Name | Summary |
|---|---|
created | val created: Long = 0 — Creation timestamp (Unix epoch seconds). |
id | val id: String — Model identifier. |
info | val info: ModelInfoDetail? = null — Model metadata. |
metrics | val metrics: ModelMetrics? = null — Runtime performance metrics. |
objectType | @SerialName(value = "object") val objectType: String — Object type (always "model"). |
ownedBy | @SerialName(value = "owned_by") val ownedBy: String — Model owner. |
ModelInfoDetail
@Serializable
data class ModelInfoDetail(
val kind: String = "",
val chatTemplateHint: String = "",
val nGpuLayers: Int = 0,
val maxContext: Int = 0,
val nVocab: Int = 0,
val nCtxTrain: Int = 0,
val nEmbd: Int = 0,
val nParams: Long = 0,
val modelSize: Long = 0
)
Detailed model metadata.
Constructors
constructor(
kind: String = "",
chatTemplateHint: String = "",
nGpuLayers: Int = 0,
maxContext: Int = 0,
nVocab: Int = 0,
nCtxTrain: Int = 0,
nEmbd: Int = 0,
nParams: Long = 0,
modelSize: Long = 0
)
Properties
| Name | Summary |
|---|---|
chatTemplateHint | @SerialName(value = "chat_template_hint") val chatTemplateHint: String — Applied chat template. |
kind | val kind: String — Model kind: "llm", "vlm", or "embed". |
maxContext | @SerialName(value = "max_context") val maxContext: Int = 0 — Maximum context size. |
modelSize | @SerialName(value = "model_size") val modelSize: Long = 0 — Model file size in bytes. |
nCtxTrain | @SerialName(value = "n_ctx_train") val nCtxTrain: Int = 0 — Training context length. |
nEmbd | @SerialName(value = "n_embd") val nEmbd: Int = 0 — Embedding dimension. |
nGpuLayers | @SerialName(value = "n_gpu_layers") val nGpuLayers: Int = 0 — Number of GPU-offloaded layers. |
nParams | @SerialName(value = "n_params") val nParams: Long = 0 — Total parameter count. |
nVocab | @SerialName(value = "n_vocab") val nVocab: Int = 0 — Vocabulary size. |
ModelMetrics
@Serializable
data class ModelMetrics(
val inferenceCount: Int = 0,
val lastUsed: Long = 0,
val loadedAt: Long = 0,
val tokensPerSecond: Double = 0.0,
val avgTokensPerSecond: Double = 0.0,
val lastLatencyMs: Double = 0.0,
val avgLatencyMs: Double = 0.0
)
Runtime performance metrics for a loaded model.
Constructors
constructor(
inferenceCount: Int = 0,
lastUsed: Long = 0,
loadedAt: Long = 0,
tokensPerSecond: Double = 0.0,
avgTokensPerSecond: Double = 0.0,
lastLatencyMs: Double = 0.0,
avgLatencyMs: Double = 0.0
)
Properties
| Name | Summary |
|---|---|
avgLatencyMs | @SerialName(value = "avg_latency_ms") val avgLatencyMs: Double = 0.0 — Average latency (ms). Embed only. |
avgTokensPerSecond | @SerialName(value = "avg_tokens_per_second") val avgTokensPerSecond: Double = 0.0 — Average throughput (tokens/sec). LLM/VLM only. |
inferenceCount | @SerialName(value = "inference_count") val inferenceCount: Int = 0 — Total inference calls. |
lastLatencyMs | @SerialName(value = "last_latency_ms") val lastLatencyMs: Double = 0.0 — Most recent latency (ms). Embed only. |
lastUsed | @SerialName(value = "last_used") val lastUsed: Long = 0 — Most recent usage (Unix epoch seconds). |
loadedAt | @SerialName(value = "loaded_at") val loadedAt: Long = 0 — When the model was loaded (Unix epoch seconds). |
tokensPerSecond | @SerialName(value = "tokens_per_second") val tokensPerSecond: Double = 0.0 — Most recent throughput (tokens/sec). LLM/VLM only. |