This vignette shows how to evaluate the local Liquid AI
LFM2.5-8B-A1B GGUF with vitals. The
goal is to evaluate model behavior on small, reproducible tasks,
including tool use. It is not a reproduction of Liquid AI’s benchmark
harnesses.
The source is in vignettes-raw/; the rendered file in
vignettes/ is precompiled with rawvignette so
R CMD check, pkgdown, and package installs do not rerun the
model.
library(Rbebelm)
# One-row summary of what this run can use: the GGUF file name (NA when no
# weights are available) and the two availability flags.
# NOTE(review): has_weights, weights_file and has_vitals are not defined in the
# visible code — presumably set in a hidden setup chunk; confirm.
data.frame(
model_file = if (has_weights) basename(weights_file) else NA_character_,
has_weights = has_weights,
has_vitals = has_vitals
)
#> model_file has_weights has_vitals
#> 1 LFM2.5-8B-A1B-Q4_K_M.gguf TRUE TRUE

If vitals is not installed, install it before
regenerating this vignette:
vitals::Task expects solvers to return
result and solver_chat. Rbebelm
is not yet an ellmer provider, so the helper below returns a minimal
placeholder object with class "Chat" for
vitals bookkeeping while storing Rbebelm-specific trace
data in solver_metadata. The task is solved, scored, and
measured; it is not logged to the Inspect viewer in this vignette.
library(vitals)
# Load the GGUF weights once; the same `model` object is passed to every
# run_task() call below.
# NOTE(review): weights_file and num_threads are not defined in the visible
# code — presumably set in a hidden setup chunk; confirm.
model <- bebel_model_load(weights_file, num_threads = num_threads)
# Build a minimal placeholder chat object for vitals bookkeeping.
#
# The result is a list of three accessor closures carrying the S3 class
# "Chat": get_model() returns `model_name`, get_system_prompt() returns an
# empty string, and get_turns() returns an empty list.
rbebelm_vitals_chat <- function(model_name = "Rbebelm/LFM2.5-8B-A1B-GGUF") {
  chat <- list(
    get_model = function() model_name,
    get_system_prompt = function() "",
    get_turns = function() list()
  )
  class(chat) <- "Chat"
  chat
}
# Convert a logical vector into the ordered score factor used by the scorers:
# TRUE becomes "C", FALSE becomes "I", NA stays NA, with levels I < C.
score_factor <- function(ok) {
  # Coerce in place so names are kept, as the ifelse()-based form did.
  if (!is.logical(ok)) {
    storage.mode(ok) <- "logical"
  }
  factor(ok, levels = c(FALSE, TRUE), labels = c("I", "C"), ordered = TRUE)
}
# Point the vitals log directory at this session's temporary directory.
vitals::vitals_log_dir_set(tempdir())
# Shorten text for display: collapse every whitespace run to a single space,
# then cut strings longer than `n` characters to `n` characters plus "...".
excerpt <- function(x, n = 120) {
  flat <- gsub("\\s+", " ", x)
  clipped <- paste0(substr(flat, 1, n), "...")
  ifelse(nchar(flat) > n, clipped, flat)
}
# Return "unknown" for a missing value, otherwise return `x` unchanged.
#
# "Missing" means NULL, zero-length, or a single NA. Vectors of length > 1 are
# returned as-is: `is.na()` is only consulted for scalars, because `||`
# requires a length-one condition (an error since R 4.3 otherwise).
or_unknown <- function(x) {
  if (is.null(x) || length(x) == 0L) {
    return("unknown")
  }
  if (length(x) == 1L && is.na(x)) {
    return("unknown")
  }
  x
}
run_task <- function(task, ...) {
task$solve(...)
task$score()
task$measure()
list(samples = task$get_samples(), metrics = task$metrics)
}

This task checks short deterministic factual answers. The scorer
extracts an ANSWER: line and compares it to the target.
# Three capital-city samples. `input` is the prompt sent to the model and
# `target` is the bare city name the scorer compares against.
# NOTE(review): each prompt already contains its own target ("Return only
# 'ANSWER: Bamako'"), so this task exercises output formatting more than
# factual recall — confirm that is intended.
factual_data <- data.frame(
input = c(
"Return only 'ANSWER: Bamako'. What is the capital of Mali?",
"Return only 'ANSWER: Rome'. What is the capital of Italy?",
"Return only 'ANSWER: Tokyo'. What is the capital of Japan?"
),
target = c("Bamako", "Rome", "Tokyo")
)
# Solver for single-turn question answering.
#
# Each input gets a fresh greedy agent (thinking budget 0, `max_gen` generated
# tokens at most); the prompt is appended as a user message and one assistant
# turn is generated.
#
# Returns a list with:
#   result          - character vector of the assistant texts, one per input
#   solver_chat     - one placeholder chat object per input
#   solver_metadata - per-input list with `chars` (length of the text) and
#                     `history_tokens` (as reported by bebel_agent_info())
rbebelm_qa_solver <- function(inputs, model, max_gen = 48, ...) {
  answer_one <- function(prompt) {
    agent <- bebel_agent(model, greedy = TRUE, max_gen = max_gen, max_think = 0)
    bebel_append_user(agent, prompt)
    reply <- bebel_assistant_turn(agent, on_event = NULL)
    list(
      text = reply$text,
      meta = list(
        chars = nchar(reply$text),
        history_tokens = bebel_agent_info(agent)$history_tokens
      )
    )
  }

  # Samples are processed one after another, in input order.
  runs <- lapply(unname(inputs), answer_one)

  list(
    result = vapply(runs, function(run) run$text, character(1)),
    solver_chat = lapply(inputs, function(...) rbebelm_vitals_chat()),
    solver_metadata = lapply(runs, function(run) run$meta)
  )
}
# Score samples by comparing the text after "ANSWER:" with the target.
#
# `samples` must provide `result` (model output) and `target` columns.
# A sample is correct ("C") when the output contains "ANSWER:" (any case), the
# target is not NA, and the extracted answer equals the target after trimming
# and lower-casing. Everything else is incorrect ("I").
#
# Returns a list with `score` (ordered factor), `explanation` (character) and
# `scorer_metadata` (per-sample list holding the extracted answer and target).
answer_line_scorer <- function(samples) {
  # The bracket expression holds a real newline character ("\n" in the R
  # string). Writing it as "\\n" puts a literal backslash and the letter "n"
  # into the class under the default regex engine, which cuts answers such as
  # "London" at the first "n".
  answer_pattern <- ".*ANSWER:\\s*([^\n.]+).*"
  extracted <- sub(answer_pattern, "\\1", samples$result, ignore.case = TRUE)
  has_answer <- grepl("ANSWER:", samples$result, ignore.case = TRUE)
  # An NA target can never be matched; the guard keeps the score from being NA.
  matched <- has_answer &
    !is.na(samples$target) &
    trimws(tolower(extracted)) == trimws(tolower(samples$target))
  list(
    score = score_factor(matched),
    explanation = ifelse(
      matched,
      "matched ANSWER line",
      "missing or incorrect ANSWER line"
    ),
    scorer_metadata = Map(
      function(answer, target) list(answer = answer, target = target),
      extracted,
      samples$target
    )
  )
}
# Assemble the factual-QA task from the dataset, solver and scorer defined
# above.
qa_task <- vitals::Task$new(
dataset = factual_data,
solver = rbebelm_qa_solver,
scorer = answer_line_scorer,
name = "rbebelm-factual-qa"
)
# run_task() forwards `model` through `...` to task$solve().
# NOTE(review): presumably vitals passes it on to the solver — confirm.
qa_eval <- run_task(qa_task, model = model)
# Print the metrics that run_task() collected from the task.
qa_eval$metrics
#> accuracy
#> 100
# Per-sample summary: target, result shortened with excerpt(), score as text,
# and the scorer's explanation.
data.frame(
sample = seq_len(nrow(qa_eval$samples)),
target = qa_eval$samples$target,
result = excerpt(qa_eval$samples$result),
score = as.character(qa_eval$samples$score),
explanation = qa_eval$samples$scorer_explanation,
row.names = NULL
)
#> sample target result score explanation
#> 1 1 Bamako < </think> ANSWER: Bamako C matched ANSWER line
#> 2 2 Rome < </think> ANSWER: Rome C matched ANSWER line
#> 3 3 Tokyo < </think> ANSWER: Tokyo C matched ANSWER line

This task evaluates whether the model emits the requested tool call and whether the agent loop reaches the expected final answer after tool execution. The tools return values from a private R context, so the answer cannot be obtained by calling an external service.
# Three tool-use samples. `input` tells the model which tool call to emit and
# how to phrase the final answer, `target` is the value the tool is expected to
# return, and `expected_tool` names the tool that must be called for the sample
# to count as correct.
tool_data <- data.frame(
input = c(
"Do not answer from memory. Emit exactly [lookup_capital(country=\"Mali\")]. After the tool result, answer exactly 'ANSWER: <tool result>'.",
"Do not answer from memory. Emit exactly [lookup_currency(country=\"Mali\")]. After the tool result, answer exactly 'ANSWER: <tool result>'.",
"Do not answer from memory. Emit exactly [lookup_capital(country=\"Italy\")]. After the tool result, answer exactly 'ANSWER: <tool result>'."
),
target = c("Bamako", "XOF", "Rome"),
expected_tool = c("lookup_capital", "lookup_currency", "lookup_capital")
)
# Solver for the tool-use task.
#
# Each input gets a fresh greedy agent and a fresh `context` environment that
# records every tool invocation as "<tool>:<country>". Two tools are offered,
# lookup_capital and lookup_currency, both answering from small in-memory
# tables; a country that is missing, malformed or not in the table yields
# "unknown" instead of raising an error.
#
# Arguments:
#   inputs        - prompts, one per sample
#   model         - loaded model passed to bebel_agent()
#   expected_tool - tool name expected for each input (same order as inputs)
#   max_steps     - forwarded to bebel_agent_run()
#
# Returns a list with:
#   result          - final assistant text per input; the full transcript when
#                     the agent loop failed or produced no usable final text
#   solver_chat     - one placeholder chat object per input
#   solver_metadata - per-input list: calls, call_count, expected_tool,
#                     expected_tool_called, loop_error (NA when the loop ran)
rbebelm_tool_solver <- function(inputs, model, expected_tool, max_steps = 3, ...) {
  # Fail before any generation rather than part-way through the samples.
  if (length(expected_tool) < length(inputs)) {
    stop("`expected_tool` must supply one tool name per input.", call. = FALSE)
  }

  capitals <- c(Mali = "Bamako", Italy = "Rome", Japan = "Tokyo")
  currencies <- c(Mali = "XOF", Japan = "yen", Italy = "euro")

  # Single-bracket lookup gives NA for an unknown name, where `[[` would raise
  # "subscript out of bounds" before or_unknown() could run.
  lookup_value <- function(table, country) {
    if (!is.character(country) || length(country) != 1L || is.na(country)) {
      return("unknown")
    }
    or_unknown(unname(table[country]))
  }

  result <- character(length(inputs))
  metadata <- vector("list", length(inputs))
  for (i in seq_along(inputs)) {
    context <- new.env(parent = emptyenv())
    context$calls <- character()
    lookup_capital <- bebel_tool("lookup_capital", function(args, context) {
      country <- args$country
      context$calls <- c(context$calls, paste0("lookup_capital:", country))
      lookup_value(capitals, country)
    })
    lookup_currency <- bebel_tool("lookup_currency", function(args, context) {
      country <- args$country
      context$calls <- c(context$calls, paste0("lookup_currency:", country))
      lookup_value(currencies, country)
    })
    agent <- bebel_agent(model, greedy = TRUE, max_gen = 128, max_think = 0)
    bebel_append_user(agent, inputs[[i]])
    # Capture a failing agent loop as a value so the remaining samples still
    # run; the message is kept in the metadata.
    run <- tryCatch(
      bebel_agent_run(
        agent,
        tools = list(lookup_capital, lookup_currency),
        context = context,
        max_steps = max_steps
      ),
      error = function(e) e
    )
    if (inherits(run, "error")) {
      result[[i]] <- bebel_transcript(agent)
      loop_error <- conditionMessage(run)
    } else {
      # A run without turns has no final text; treat it like a missing answer
      # instead of indexing into an empty list.
      turns <- run$turns
      final_text <- if (length(turns) > 0L) turns[[length(turns)]]$text else NULL
      final_text <- or_unknown(final_text)
      result[[i]] <- if (identical(final_text, "unknown")) {
        bebel_transcript(agent)
      } else {
        final_text
      }
      loop_error <- NA_character_
    }
    metadata[[i]] <- list(
      calls = context$calls,
      call_count = length(context$calls),
      expected_tool = expected_tool[[i]],
      expected_tool_called = any(
        startsWith(context$calls, paste0(expected_tool[[i]], ":"))
      ),
      loop_error = loop_error
    )
  }
  list(
    result = result,
    solver_chat = lapply(inputs, function(...) rbebelm_vitals_chat()),
    solver_metadata = metadata
  )
}
# Score tool-use samples: a sample is correct ("C") only when the final answer
# matches the target AND the solver recorded a call to the expected tool.
#
# `samples` must provide `result`, `target` and `solver_metadata`; each
# metadata entry is read for its `expected_tool_called` flag.
#
# Returns a list with `score` (ordered factor), `explanation` (character) and
# `scorer_metadata` (per-sample list: answer, answer_ok, tool_ok).
tool_scorer <- function(samples) {
  metadata <- samples$solver_metadata
  # The bracket expression holds a real newline character ("\n" in the R
  # string). Writing it as "\\n" puts a literal backslash and the letter "n"
  # into the class under the default regex engine, which cuts answers such as
  # "London" at the first "n".
  answer_pattern <- ".*ANSWER:\\s*([^\n.]+).*"
  answer <- sub(answer_pattern, "\\1", samples$result, ignore.case = TRUE)
  # An NA target can never be matched; the guard keeps the score from being NA.
  answer_ok <- grepl("ANSWER:", samples$result, ignore.case = TRUE) &
    !is.na(samples$target) &
    trimws(tolower(answer)) == trimws(tolower(samples$target))
  tool_ok <- vapply(
    metadata,
    function(x) isTRUE(x$expected_tool_called),
    logical(1)
  )
  ok <- answer_ok & tool_ok
  list(
    score = score_factor(ok),
    explanation = ifelse(
      ok,
      "expected tool and answer observed",
      "missing expected tool call or final answer"
    ),
    scorer_metadata = Map(
      function(answer, answer_ok, tool_ok) {
        list(answer = answer, answer_ok = answer_ok, tool_ok = tool_ok)
      },
      answer, answer_ok, tool_ok
    )
  )
}
# Assemble the tool-use task from the dataset, solver and scorer defined above.
tool_task <- vitals::Task$new(
dataset = tool_data,
solver = rbebelm_tool_solver,
scorer = tool_scorer,
name = "rbebelm-tool-use"
)
# `expected_tool` is passed alongside `model` through run_task()'s `...`, one
# entry per dataset row.
# NOTE(review): with epochs > 1 or a subset of rows the solver's inputs would
# no longer line up with this vector — confirm before changing the run setup.
tool_eval <- run_task(tool_task, model = model, expected_tool = tool_data$expected_tool)
# Print the metrics that run_task() collected from the task.
tool_eval$metrics
#> accuracy
#> 66.66667
# Per-sample summary: target, shortened result, score, the two scorer flags
# (answer_ok, tool_ok), the recorded tool calls joined with " | ", and the
# scorer's explanation.
data.frame(
sample = seq_len(nrow(tool_eval$samples)),
target = tool_eval$samples$target,
result = excerpt(tool_eval$samples$result),
score = as.character(tool_eval$samples$score),
answer_ok = vapply(tool_eval$samples$scorer_metadata, function(x) isTRUE(x$answer_ok), logical(1)),
tool_ok = vapply(tool_eval$samples$scorer_metadata, function(x) isTRUE(x$tool_ok), logical(1)),
calls = vapply(tool_eval$samples$solver_metadata, function(x) paste(x$calls, collapse = " | "), character(1)),
explanation = tool_eval$samples$scorer_explanation,
row.names = NULL
)
#> sample target
#> 1 1 Bamako
#> 2 2 XOF
#> 3 3 Rome
#> result
#> 1 ANSWER: Bamako
#> 2 <tool_call> {"name": "lookup_currency", "arguments": {"country": "Mali"}} </tool_call> <tool_result> {"currency": "Mali ...
#> 3 ANSWER: Rome
#> score answer_ok tool_ok calls
#> 1 C TRUE TRUE lookup_capital:Mali | lookup_capital:Mali
#> 2 I FALSE FALSE
#> 3 C TRUE TRUE lookup_capital:Italy
#> explanation
#> 1 expected tool and answer observed
#> 2 missing expected tool call or final answer
#> 3 expected tool and answer observed

This task checks simple constrained formatting. It is intentionally small; add rows and epochs to make it more robust.
# Three formatting prompts. `target` is not an expected answer but a key that
# selects which format check instruction_scorer() applies to the result.
instruction_data <- data.frame(
input = c(
"Return exactly three comma-separated lowercase colors and nothing else.",
"Return a JSON object with keys city and country for Bamako, and nothing else.",
"Return exactly one line beginning with ANSWER: followed by the word ready."
),
target = c("comma_colors", "json_city_country", "answer_ready")
)
# Solver for the instruction-following task.
#
# Each input gets a fresh greedy agent (thinking budget 0, at most 80 generated
# tokens); the prompt is appended as a user message and one assistant turn is
# generated.
#
# Returns a list with:
#   result          - character vector of the assistant texts, one per input
#   solver_chat     - one placeholder chat object per input
#   solver_metadata - per-input list with `chars`, the length of the text
rbebelm_instruction_solver <- function(inputs, model, ...) {
  follow_one <- function(prompt) {
    agent <- bebel_agent(model, greedy = TRUE, max_gen = 80, max_think = 0)
    bebel_append_user(agent, prompt)
    bebel_assistant_turn(agent, on_event = NULL)$text
  }

  # USE.NAMES = FALSE keeps the result an unnamed character vector.
  replies <- vapply(unname(inputs), follow_one, character(1), USE.NAMES = FALSE)

  list(
    result = replies,
    solver_chat = lapply(inputs, function(...) rbebelm_vitals_chat()),
    solver_metadata = lapply(replies, function(text) list(chars = nchar(text)))
  )
}
# Score formatting samples. The `target` column names the check to apply to
# the whitespace-trimmed result:
#   comma_colors      - exactly three lowercase words separated by commas
#   json_city_country - starts with "{", has a "city" key, and has a "country" key
#   answer_ready      - the whole text is "ANSWER: ready" (case-insensitive)
# Any other target value scores as incorrect.
#
# Returns a list with `score` (ordered factor) and `explanation` (character).
instruction_scorer <- function(samples) {
  checks <- list(
    comma_colors = function(txt) {
      grepl("^[a-z]+,\\s*[a-z]+,\\s*[a-z]+$", txt)
    },
    json_city_country = function(txt) {
      grepl('^\\s*\\{.*"city"\\s*:', txt) && grepl('"country"\\s*:', txt)
    },
    answer_ready = function(txt) {
      grepl("^ANSWER:\\s*ready\\s*$", txt, ignore.case = TRUE)
    }
  )

  ok <- vapply(
    seq_len(nrow(samples)),
    function(row) {
      check <- checks[[samples$target[[row]]]]
      if (is.null(check)) {
        return(FALSE)
      }
      check(trimws(samples$result[[row]]))
    },
    logical(1)
  )

  list(
    score = score_factor(ok),
    explanation = ifelse(ok, "format matched", "format did not match")
  )
}
# Assemble the instruction-following task from the dataset, solver and scorer
# defined above.
instruction_task <- vitals::Task$new(
dataset = instruction_data,
solver = rbebelm_instruction_solver,
scorer = instruction_scorer,
name = "rbebelm-instruction-following"
)
# run_task() forwards `model` through `...` to task$solve().
instruction_eval <- run_task(instruction_task, model = model)
# Print the metrics that run_task() collected from the task.
instruction_eval$metrics
#> accuracy
#> 33.33333
# Per-sample summary: the format key, result shortened with excerpt(), score as
# text, and the scorer's explanation.
data.frame(
sample = seq_len(nrow(instruction_eval$samples)),
target = instruction_eval$samples$target,
result = excerpt(instruction_eval$samples$result),
score = as.character(instruction_eval$samples$score),
explanation = instruction_eval$samples$scorer_explanation,
row.names = NULL
)
#> sample target
#> 1 1 comma_colors
#> 2 2 json_city_country
#> 3 3 answer_ready
#> result
#> 1 { "output": [ "red", "green", "blue" ] }
#> 2 { "city": "Bamako", "country": "Mali" } </think> { "city": "Bamako", "country": "Mali" }
#> 3 <REASONING> The user's request is to "solve the following problem" and then "return exactly one line beginning with ANSW...
#> score explanation
#> 1 I format did not match
#> 2 C format matched
#> 3 I format did not match

These tasks evaluate observed model behavior under this GGUF, prompt
set, backend, and decoding configuration. They do not verify the
architecture, training token count, RL recipe, published benchmark
numbers, or license terms. To make the evaluation stronger, add more
rows, set epochs > 1, vary decoding parameters, and
compare against other local or API-backed models with the same
vitals task definitions.