Model evaluation with vitals

This vignette shows how to evaluate the local Liquid AI LFM2.5-8B-A1B GGUF with vitals. The goal is to evaluate model behavior on small, reproducible tasks, including tool use. It is not a reproduction of Liquid AI’s benchmark harnesses.

The source is in vignettes-raw/; the rendered file in vignettes/ is precompiled with rawvignette so R CMD check, pkgdown, and package installs do not rerun the model.

library(Rbebelm)
data.frame(
  model_file = if (has_weights) basename(weights_file) else NA_character_,
  has_weights = has_weights,
  has_vitals = has_vitals
)
#>                  model_file has_weights has_vitals
#> 1 LFM2.5-8B-A1B-Q4_K_M.gguf        TRUE       TRUE

If vitals is not installed, install it before regenerating this vignette:

install.packages("vitals")
# or: pak::pak("tidyverse/vitals")

Evaluation harness

vitals::Task expects solvers to return result and solver_chat. Rbebelm is not yet an ellmer provider, so the helper below returns a minimal placeholder object with class "Chat" for vitals bookkeeping while storing Rbebelm-specific trace data in solver_metadata. The task is solved, scored, and measured; it is not logged to the Inspect viewer in this vignette.

library(vitals)

model <- bebel_model_load(weights_file, num_threads = num_threads)

rbebelm_vitals_chat <- function(model_name = "Rbebelm/LFM2.5-8B-A1B-GGUF") {
  structure(
    list(
      get_model = function() model_name,
      get_system_prompt = function() "",
      get_turns = function() list()
    ),
    class = "Chat"
  )
}

score_factor <- function(ok) {
  factor(ifelse(ok, "C", "I"), levels = c("I", "C"), ordered = TRUE)
}

vitals::vitals_log_dir_set(tempdir())

excerpt <- function(x, n = 120) {
  x <- gsub("\\s+", " ", x)
  ifelse(nchar(x) > n, paste0(substr(x, 1, n), "..."), x)
}

or_unknown <- function(x) {
  if (is.null(x) || length(x) == 0L || is.na(x)) "unknown" else x
}

run_task <- function(task, ...) {
  task$solve(...)
  task$score()
  task$measure()
  list(samples = task$get_samples(), metrics = task$metrics)
}

Factual QA task

This task checks short deterministic factual answers. The scorer extracts an ANSWER: line and compares it to the target.

factual_data <- data.frame(
  input = c(
    "Return only 'ANSWER: Bamako'. What is the capital of Mali?",
    "Return only 'ANSWER: Rome'. What is the capital of Italy?",
    "Return only 'ANSWER: Tokyo'. What is the capital of Japan?"
  ),
  target = c("Bamako", "Rome", "Tokyo")
)

rbebelm_qa_solver <- function(inputs, model, max_gen = 48, ...) {
  result <- character(length(inputs))
  metadata <- vector("list", length(inputs))
  for (i in seq_along(inputs)) {
    agent <- bebel_agent(model, greedy = TRUE, max_gen = max_gen, max_think = 0)
    bebel_append_user(agent, inputs[[i]])
    turn <- bebel_assistant_turn(agent, on_event = NULL)
    result[[i]] <- turn$text
    metadata[[i]] <- list(
      chars = nchar(turn$text),
      history_tokens = bebel_agent_info(agent)$history_tokens
    )
  }
  list(
    result = result,
    solver_chat = lapply(inputs, function(...) rbebelm_vitals_chat()),
    solver_metadata = metadata
  )
}

answer_line_scorer <- function(samples) {
  extracted <- sub(".*ANSWER:\\s*([^\\n.]+).*", "\\1", samples$result, ignore.case = TRUE)
  matched <- grepl("ANSWER:", samples$result, ignore.case = TRUE) &
    trimws(tolower(extracted)) == trimws(tolower(samples$target))
  list(
    score = score_factor(matched),
    explanation = ifelse(matched, "matched ANSWER line", "missing or incorrect ANSWER line"),
    scorer_metadata = Map(function(answer, target) list(answer = answer, target = target), extracted, samples$target)
  )
}

qa_task <- vitals::Task$new(
  dataset = factual_data,
  solver = rbebelm_qa_solver,
  scorer = answer_line_scorer,
  name = "rbebelm-factual-qa"
)

qa_eval <- run_task(qa_task, model = model)
qa_eval$metrics
#> accuracy 
#>      100
data.frame(
  sample = seq_len(nrow(qa_eval$samples)),
  target = qa_eval$samples$target,
  result = excerpt(qa_eval$samples$result),
  score = as.character(qa_eval$samples$score),
  explanation = qa_eval$samples$scorer_explanation,
  row.names = NULL
)
#>   sample target                    result score         explanation
#> 1      1 Bamako < </think> ANSWER: Bamako     C matched ANSWER line
#> 2      2   Rome   < </think> ANSWER: Rome     C matched ANSWER line
#> 3      3  Tokyo  < </think> ANSWER: Tokyo     C matched ANSWER line

Tool-use task

This task evaluates whether the model emits the requested tool call and whether the agent loop reaches the expected final answer after tool execution. The tools return values from a private R context, so the answer cannot be obtained by calling an external service.

tool_data <- data.frame(
  input = c(
    "Do not answer from memory. Emit exactly [lookup_capital(country=\"Mali\")]. After the tool result, answer exactly 'ANSWER: <tool result>'.",
    "Do not answer from memory. Emit exactly [lookup_currency(country=\"Mali\")]. After the tool result, answer exactly 'ANSWER: <tool result>'.",
    "Do not answer from memory. Emit exactly [lookup_capital(country=\"Italy\")]. After the tool result, answer exactly 'ANSWER: <tool result>'."
  ),
  target = c("Bamako", "XOF", "Rome"),
  expected_tool = c("lookup_capital", "lookup_currency", "lookup_capital")
)

rbebelm_tool_solver <- function(inputs, model, expected_tool, max_steps = 3, ...) {
  result <- character(length(inputs))
  metadata <- vector("list", length(inputs))

  for (i in seq_along(inputs)) {
    context <- new.env(parent = emptyenv())
    context$calls <- character()

    lookup_capital <- bebel_tool("lookup_capital", function(args, context) {
      country <- args$country
      context$calls <- c(context$calls, paste0("lookup_capital:", country))
      or_unknown(c(Mali = "Bamako", Italy = "Rome", Japan = "Tokyo")[[country]])
    })
    lookup_currency <- bebel_tool("lookup_currency", function(args, context) {
      country <- args$country
      context$calls <- c(context$calls, paste0("lookup_currency:", country))
      or_unknown(c(Mali = "XOF", Japan = "yen", Italy = "euro")[[country]])
    })

    agent <- bebel_agent(model, greedy = TRUE, max_gen = 128, max_think = 0)
    bebel_append_user(agent, inputs[[i]])
    run <- tryCatch(
      bebel_agent_run(
        agent,
        tools = list(lookup_capital, lookup_currency),
        context = context,
        max_steps = max_steps
      ),
      error = function(e) e
    )

    if (inherits(run, "error")) {
      result[[i]] <- bebel_transcript(agent)
      loop_error <- conditionMessage(run)
    } else {
      result[[i]] <- or_unknown(tail(run$turns, 1)[[1]]$text)
      if (identical(result[[i]], "unknown")) result[[i]] <- bebel_transcript(agent)
      loop_error <- NA_character_
    }
    metadata[[i]] <- list(
      calls = context$calls,
      call_count = length(context$calls),
      expected_tool = expected_tool[[i]],
      expected_tool_called = any(startsWith(context$calls, paste0(expected_tool[[i]], ":"))),
      loop_error = loop_error
    )
  }

  list(
    result = result,
    solver_chat = lapply(inputs, function(...) rbebelm_vitals_chat()),
    solver_metadata = metadata
  )
}

tool_scorer <- function(samples) {
  metadata <- samples$solver_metadata
  answer <- sub(".*ANSWER:\\s*([^\\n.]+).*", "\\1", samples$result, ignore.case = TRUE)
  answer_ok <- grepl("ANSWER:", samples$result, ignore.case = TRUE) &
    trimws(tolower(answer)) == trimws(tolower(samples$target))
  tool_ok <- vapply(metadata, function(x) isTRUE(x$expected_tool_called), logical(1))
  ok <- answer_ok & tool_ok
  list(
    score = score_factor(ok),
    explanation = ifelse(ok, "expected tool and answer observed", "missing expected tool call or final answer"),
    scorer_metadata = Map(
      function(answer, answer_ok, tool_ok) list(answer = answer, answer_ok = answer_ok, tool_ok = tool_ok),
      answer, answer_ok, tool_ok
    )
  )
}

tool_task <- vitals::Task$new(
  dataset = tool_data,
  solver = rbebelm_tool_solver,
  scorer = tool_scorer,
  name = "rbebelm-tool-use"
)

tool_eval <- run_task(tool_task, model = model, expected_tool = tool_data$expected_tool)
tool_eval$metrics
#> accuracy 
#> 66.66667
data.frame(
  sample = seq_len(nrow(tool_eval$samples)),
  target = tool_eval$samples$target,
  result = excerpt(tool_eval$samples$result),
  score = as.character(tool_eval$samples$score),
  answer_ok = vapply(tool_eval$samples$scorer_metadata, function(x) isTRUE(x$answer_ok), logical(1)),
  tool_ok = vapply(tool_eval$samples$scorer_metadata, function(x) isTRUE(x$tool_ok), logical(1)),
  calls = vapply(tool_eval$samples$solver_metadata, function(x) paste(x$calls, collapse = " | "), character(1)),
  explanation = tool_eval$samples$scorer_explanation,
  row.names = NULL
)
#>   sample target
#> 1      1 Bamako
#> 2      2    XOF
#> 3      3   Rome
#>                                                                                                                        result
#> 1                                                                                                              ANSWER: Bamako
#> 2 <tool_call> {"name": "lookup_currency", "arguments": {"country": "Mali"}} </tool_call> <tool_result> {"currency": "Mali ...
#> 3                                                                                                                ANSWER: Rome
#>   score answer_ok tool_ok                                     calls
#> 1     C      TRUE    TRUE lookup_capital:Mali | lookup_capital:Mali
#> 2     I     FALSE   FALSE                                          
#> 3     C      TRUE    TRUE                      lookup_capital:Italy
#>                                  explanation
#> 1          expected tool and answer observed
#> 2 missing expected tool call or final answer
#> 3          expected tool and answer observed

Instruction-following task

This task checks simple constrained formatting. It is intentionally small; add rows and epochs to make it more robust.

instruction_data <- data.frame(
  input = c(
    "Return exactly three comma-separated lowercase colors and nothing else.",
    "Return a JSON object with keys city and country for Bamako, and nothing else.",
    "Return exactly one line beginning with ANSWER: followed by the word ready."
  ),
  target = c("comma_colors", "json_city_country", "answer_ready")
)

rbebelm_instruction_solver <- function(inputs, model, ...) {
  result <- character(length(inputs))
  metadata <- vector("list", length(inputs))
  for (i in seq_along(inputs)) {
    agent <- bebel_agent(model, greedy = TRUE, max_gen = 80, max_think = 0)
    bebel_append_user(agent, inputs[[i]])
    turn <- bebel_assistant_turn(agent, on_event = NULL)
    result[[i]] <- turn$text
    metadata[[i]] <- list(chars = nchar(turn$text))
  }
  list(
    result = result,
    solver_chat = lapply(inputs, function(...) rbebelm_vitals_chat()),
    solver_metadata = metadata
  )
}

instruction_scorer <- function(samples) {
  ok <- logical(nrow(samples))
  for (i in seq_len(nrow(samples))) {
    x <- trimws(samples$result[[i]])
    ok[[i]] <- switch(
      samples$target[[i]],
      comma_colors = grepl("^[a-z]+,\\s*[a-z]+,\\s*[a-z]+$", x),
      json_city_country = grepl('^\\s*\\{.*"city"\\s*:', x) && grepl('"country"\\s*:', x),
      answer_ready = grepl("^ANSWER:\\s*ready\\s*$", x, ignore.case = TRUE),
      FALSE
    )
  }
  list(
    score = score_factor(ok),
    explanation = ifelse(ok, "format matched", "format did not match")
  )
}

instruction_task <- vitals::Task$new(
  dataset = instruction_data,
  solver = rbebelm_instruction_solver,
  scorer = instruction_scorer,
  name = "rbebelm-instruction-following"
)

instruction_eval <- run_task(instruction_task, model = model)
instruction_eval$metrics
#> accuracy 
#> 33.33333
data.frame(
  sample = seq_len(nrow(instruction_eval$samples)),
  target = instruction_eval$samples$target,
  result = excerpt(instruction_eval$samples$result),
  score = as.character(instruction_eval$samples$score),
  explanation = instruction_eval$samples$scorer_explanation,
  row.names = NULL
)
#>   sample            target
#> 1      1      comma_colors
#> 2      2 json_city_country
#> 3      3      answer_ready
#>                                                                                                                        result
#> 1                                                                                    { "output": [ "red", "green", "blue" ] }
#> 2                                    { "city": "Bamako", "country": "Mali" } </think> { "city": "Bamako", "country": "Mali" }
#> 3 <REASONING> The user's request is to "solve the following problem" and then "return exactly one line beginning with ANSW...
#>   score          explanation
#> 1     I format did not match
#> 2     C       format matched
#> 3     I format did not match

Interpreting the results

These tasks evaluate observed model behavior under this GGUF, prompt set, backend, and decoding configuration. They do not verify the architecture, training token count, RL recipe, published benchmark numbers, or license terms. To make the evaluation stronger, add more rows, set epochs > 1, vary decoding parameters, and compare against other local or API-backed models with the same vitals task definitions.