--- title: "Model evaluation with vitals" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Model evaluation with vitals} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- This vignette shows how to evaluate the local [Liquid AI LFM2.5-8B-A1B](https://www.liquid.ai/blog/lfm2-5-8b-a1b) GGUF with [`vitals`](https://github.com/tidyverse/vitals). The goal is to evaluate model behavior on small, reproducible tasks, including tool use. It is not a reproduction of Liquid AI's benchmark harnesses. The source is in `vignettes-raw/`; the rendered file in `vignettes/` is precompiled with `rawvignette` so `R CMD check`, pkgdown, and package installs do not rerun the model. ``` r library(Rbebelm) data.frame( model_file = if (has_weights) basename(weights_file) else NA_character_, has_weights = has_weights, has_vitals = has_vitals ) #> model_file has_weights has_vitals #> 1 LFM2.5-8B-A1B-Q4_K_M.gguf TRUE TRUE ``` If `vitals` is not installed, install it before regenerating this vignette: ``` r install.packages("vitals") # or: pak::pak("tidyverse/vitals") ``` ## Evaluation harness `vitals::Task` expects solvers to return `result` and `solver_chat`. `Rbebelm` is not yet an ellmer provider, so the helper below returns a minimal placeholder object with class `"Chat"` for `vitals` bookkeeping while storing Rbebelm-specific trace data in `solver_metadata`. The task is solved, scored, and measured; it is not logged to the Inspect viewer in this vignette. ``` r library(vitals) model <- bebel_model_load(weights_file, num_threads = num_threads) rbebelm_vitals_chat <- function(model_name = "Rbebelm/LFM2.5-8B-A1B-GGUF") { structure( list( get_model = function() model_name, get_system_prompt = function() "", get_turns = function() list() ), class = "Chat" ) } score_factor <- function(ok) { factor(ifelse(ok, "C", "I"), levels = c("I", "C"), ordered = TRUE) } vitals::vitals_log_dir_set(tempdir()) excerpt <- function(x, n = 120) { x <- gsub("\\s+", " ", x) ifelse(nchar(x) > n, paste0(substr(x, 1, n), "..."), x) } or_unknown <- function(x) { if (is.null(x) || length(x) == 0L || is.na(x)) "unknown" else x } run_task <- function(task, ...) { task$solve(...) task$score() task$measure() list(samples = task$get_samples(), metrics = task$metrics) } ``` ## Factual QA task This task checks short deterministic factual answers. The scorer extracts an `ANSWER:` line and compares it to the target. ``` r factual_data <- data.frame( input = c( "Return only 'ANSWER: Bamako'. What is the capital of Mali?", "Return only 'ANSWER: Rome'. What is the capital of Italy?", "Return only 'ANSWER: Tokyo'. What is the capital of Japan?" ), target = c("Bamako", "Rome", "Tokyo") ) rbebelm_qa_solver <- function(inputs, model, max_gen = 48, ...) { result <- character(length(inputs)) metadata <- vector("list", length(inputs)) for (i in seq_along(inputs)) { agent <- bebel_agent(model, greedy = TRUE, max_gen = max_gen, max_think = 0) bebel_append_user(agent, inputs[[i]]) turn <- bebel_assistant_turn(agent, on_event = NULL) result[[i]] <- turn$text metadata[[i]] <- list( chars = nchar(turn$text), history_tokens = bebel_agent_info(agent)$history_tokens ) } list( result = result, solver_chat = lapply(inputs, function(...) rbebelm_vitals_chat()), solver_metadata = metadata ) } answer_line_scorer <- function(samples) { extracted <- sub(".*ANSWER:\\s*([^\\n.]+).*", "\\1", samples$result, ignore.case = TRUE) matched <- grepl("ANSWER:", samples$result, ignore.case = TRUE) & trimws(tolower(extracted)) == trimws(tolower(samples$target)) list( score = score_factor(matched), explanation = ifelse(matched, "matched ANSWER line", "missing or incorrect ANSWER line"), scorer_metadata = Map(function(answer, target) list(answer = answer, target = target), extracted, samples$target) ) } qa_task <- vitals::Task$new( dataset = factual_data, solver = rbebelm_qa_solver, scorer = answer_line_scorer, name = "rbebelm-factual-qa" ) qa_eval <- run_task(qa_task, model = model) qa_eval$metrics #> accuracy #> 100 data.frame( sample = seq_len(nrow(qa_eval$samples)), target = qa_eval$samples$target, result = excerpt(qa_eval$samples$result), score = as.character(qa_eval$samples$score), explanation = qa_eval$samples$scorer_explanation, row.names = NULL ) #> sample target result score explanation #> 1 1 Bamako < ANSWER: Bamako C matched ANSWER line #> 2 2 Rome < ANSWER: Rome C matched ANSWER line #> 3 3 Tokyo < ANSWER: Tokyo C matched ANSWER line ``` ## Tool-use task This task evaluates whether the model emits the requested tool call and whether the agent loop reaches the expected final answer after tool execution. The tools return values from a private R context, so the answer cannot be obtained by calling an external service. ``` r tool_data <- data.frame( input = c( "Do not answer from memory. Emit exactly [lookup_capital(country=\"Mali\")]. After the tool result, answer exactly 'ANSWER: '.", "Do not answer from memory. Emit exactly [lookup_currency(country=\"Mali\")]. After the tool result, answer exactly 'ANSWER: '.", "Do not answer from memory. Emit exactly [lookup_capital(country=\"Italy\")]. After the tool result, answer exactly 'ANSWER: '." ), target = c("Bamako", "XOF", "Rome"), expected_tool = c("lookup_capital", "lookup_currency", "lookup_capital") ) rbebelm_tool_solver <- function(inputs, model, expected_tool, max_steps = 3, ...) { result <- character(length(inputs)) metadata <- vector("list", length(inputs)) for (i in seq_along(inputs)) { context <- new.env(parent = emptyenv()) context$calls <- character() lookup_capital <- bebel_tool("lookup_capital", function(args, context) { country <- args$country context$calls <- c(context$calls, paste0("lookup_capital:", country)) or_unknown(c(Mali = "Bamako", Italy = "Rome", Japan = "Tokyo")[[country]]) }) lookup_currency <- bebel_tool("lookup_currency", function(args, context) { country <- args$country context$calls <- c(context$calls, paste0("lookup_currency:", country)) or_unknown(c(Mali = "XOF", Japan = "yen", Italy = "euro")[[country]]) }) agent <- bebel_agent(model, greedy = TRUE, max_gen = 128, max_think = 0) bebel_append_user(agent, inputs[[i]]) run <- tryCatch( bebel_agent_run( agent, tools = list(lookup_capital, lookup_currency), context = context, max_steps = max_steps ), error = function(e) e ) if (inherits(run, "error")) { result[[i]] <- bebel_transcript(agent) loop_error <- conditionMessage(run) } else { result[[i]] <- or_unknown(tail(run$turns, 1)[[1]]$text) if (identical(result[[i]], "unknown")) result[[i]] <- bebel_transcript(agent) loop_error <- NA_character_ } metadata[[i]] <- list( calls = context$calls, call_count = length(context$calls), expected_tool = expected_tool[[i]], expected_tool_called = any(startsWith(context$calls, paste0(expected_tool[[i]], ":"))), loop_error = loop_error ) } list( result = result, solver_chat = lapply(inputs, function(...) rbebelm_vitals_chat()), solver_metadata = metadata ) } tool_scorer <- function(samples) { metadata <- samples$solver_metadata answer <- sub(".*ANSWER:\\s*([^\\n.]+).*", "\\1", samples$result, ignore.case = TRUE) answer_ok <- grepl("ANSWER:", samples$result, ignore.case = TRUE) & trimws(tolower(answer)) == trimws(tolower(samples$target)) tool_ok <- vapply(metadata, function(x) isTRUE(x$expected_tool_called), logical(1)) ok <- answer_ok & tool_ok list( score = score_factor(ok), explanation = ifelse(ok, "expected tool and answer observed", "missing expected tool call or final answer"), scorer_metadata = Map( function(answer, answer_ok, tool_ok) list(answer = answer, answer_ok = answer_ok, tool_ok = tool_ok), answer, answer_ok, tool_ok ) ) } tool_task <- vitals::Task$new( dataset = tool_data, solver = rbebelm_tool_solver, scorer = tool_scorer, name = "rbebelm-tool-use" ) tool_eval <- run_task(tool_task, model = model, expected_tool = tool_data$expected_tool) tool_eval$metrics #> accuracy #> 66.66667 data.frame( sample = seq_len(nrow(tool_eval$samples)), target = tool_eval$samples$target, result = excerpt(tool_eval$samples$result), score = as.character(tool_eval$samples$score), answer_ok = vapply(tool_eval$samples$scorer_metadata, function(x) isTRUE(x$answer_ok), logical(1)), tool_ok = vapply(tool_eval$samples$scorer_metadata, function(x) isTRUE(x$tool_ok), logical(1)), calls = vapply(tool_eval$samples$solver_metadata, function(x) paste(x$calls, collapse = " | "), character(1)), explanation = tool_eval$samples$scorer_explanation, row.names = NULL ) #> sample target #> 1 1 Bamako #> 2 2 XOF #> 3 3 Rome #> result #> 1 ANSWER: Bamako #> 2 {"name": "lookup_currency", "arguments": {"country": "Mali"}} {"currency": "Mali ... #> 3 ANSWER: Rome #> score answer_ok tool_ok calls #> 1 C TRUE TRUE lookup_capital:Mali | lookup_capital:Mali #> 2 I FALSE FALSE #> 3 C TRUE TRUE lookup_capital:Italy #> explanation #> 1 expected tool and answer observed #> 2 missing expected tool call or final answer #> 3 expected tool and answer observed ``` ## Instruction-following task This task checks simple constrained formatting. It is intentionally small; add rows and epochs to make it more robust. ``` r instruction_data <- data.frame( input = c( "Return exactly three comma-separated lowercase colors and nothing else.", "Return a JSON object with keys city and country for Bamako, and nothing else.", "Return exactly one line beginning with ANSWER: followed by the word ready." ), target = c("comma_colors", "json_city_country", "answer_ready") ) rbebelm_instruction_solver <- function(inputs, model, ...) { result <- character(length(inputs)) metadata <- vector("list", length(inputs)) for (i in seq_along(inputs)) { agent <- bebel_agent(model, greedy = TRUE, max_gen = 80, max_think = 0) bebel_append_user(agent, inputs[[i]]) turn <- bebel_assistant_turn(agent, on_event = NULL) result[[i]] <- turn$text metadata[[i]] <- list(chars = nchar(turn$text)) } list( result = result, solver_chat = lapply(inputs, function(...) rbebelm_vitals_chat()), solver_metadata = metadata ) } instruction_scorer <- function(samples) { ok <- logical(nrow(samples)) for (i in seq_len(nrow(samples))) { x <- trimws(samples$result[[i]]) ok[[i]] <- switch( samples$target[[i]], comma_colors = grepl("^[a-z]+,\\s*[a-z]+,\\s*[a-z]+$", x), json_city_country = grepl('^\\s*\\{.*"city"\\s*:', x) && grepl('"country"\\s*:', x), answer_ready = grepl("^ANSWER:\\s*ready\\s*$", x, ignore.case = TRUE), FALSE ) } list( score = score_factor(ok), explanation = ifelse(ok, "format matched", "format did not match") ) } instruction_task <- vitals::Task$new( dataset = instruction_data, solver = rbebelm_instruction_solver, scorer = instruction_scorer, name = "rbebelm-instruction-following" ) instruction_eval <- run_task(instruction_task, model = model) instruction_eval$metrics #> accuracy #> 33.33333 data.frame( sample = seq_len(nrow(instruction_eval$samples)), target = instruction_eval$samples$target, result = excerpt(instruction_eval$samples$result), score = as.character(instruction_eval$samples$score), explanation = instruction_eval$samples$scorer_explanation, row.names = NULL ) #> sample target #> 1 1 comma_colors #> 2 2 json_city_country #> 3 3 answer_ready #> result #> 1 { "output": [ "red", "green", "blue" ] } #> 2 { "city": "Bamako", "country": "Mali" } { "city": "Bamako", "country": "Mali" } #> 3 The user's request is to "solve the following problem" and then "return exactly one line beginning with ANSW... #> score explanation #> 1 I format did not match #> 2 C format matched #> 3 I format did not match ``` ## Interpreting the results These tasks evaluate observed model behavior under this GGUF, prompt set, backend, and decoding configuration. They do not verify the architecture, training token count, RL recipe, published benchmark numbers, or license terms. To make the evaluation stronger, add more rows, set `epochs > 1`, vary decoding parameters, and compare against other local or API-backed models with the same `vitals` task definitions.