Galileo Experiments allow you to evaluate and improve your LLM applications by running tests against datasets and measuring performance using various metrics.
The simplest way to get started is by using a prompt template. In the runExperiment options below, datasetName expects the name of a dataset that you have already created and saved, through either the console or the SDK. Make sure that dataset exists before running the experiment!
import { createDataset, createPromptTemplate, runExperiment, getDataset } from "galileo";
import { MessageRole } from "galileo/dist/types/message.types";

/**
 * Creates the "geography-prompt" prompt template in the "my-project" project
 * and then runs "geography-experiment" with it against the saved
 * "geography-dataset" dataset, requesting the "correctness" metric.
 *
 * @returns A promise that settles once the experiment call has completed.
 */
async function runPromptTemplateExperiment(): Promise<void> {
  const projectName = "my-project";

  // System message fixes the model's role; the user message is the
  // "{{input}}" placeholder (presumably filled from each dataset row — confirm
  // against the SDK's templating rules).
  const template = await createPromptTemplate({
    template: [
      {
        role: MessageRole.system,
        content: "You are a geography expert. Respond with only the continent name.",
      },
      { role: MessageRole.user, content: "{{input}}" },
    ],
    projectName: projectName,
    name: "geography-prompt",
  });

  await runExperiment({
    name: "geography-experiment",
    // Make sure you have a dataset created first
    datasetName: "geography-dataset",
    promptTemplate: template,
    promptSettings: {
      max_tokens: 256,
      model_alias: "GPT-4o",
      temperature: 0.8,
    },
    metrics: ["correctness"],
    projectName: projectName,
  });
}

// Run the experiment
runPromptTemplateExperiment().catch((error: unknown) => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error("Prompt template experiment failed:", error);
  process.exitCode = 1;
});
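For more complex scenarios, you can use a runner function. In this case, you may use either a saved dataset or a custom one. The first example below references a saved dataset by name (datasetName), and the second passes a custom dataset defined directly in code (dataset).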
import { runExperiment } from "galileo";
import { OpenAI } from "openai";

/**
 * Runs "story-function-experiment" in the "my-project" project against the
 * saved "storyteller-dataset" dataset, using a custom runner function that
 * calls the OpenAI chat completions API, and requests the "correctness" metric.
 *
 * The OpenAI API key is read from the OPENAI_API_KEY environment variable.
 *
 * @returns A promise that settles once the experiment call has completed.
 */
async function runFunctionExperiment(): Promise<void> {
  const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

  /**
   * Asks the model for a story about `input["topic"]` and returns the first
   * choice's message content wrapped in a single-element array.
   *
   * NOTE(review): assumes each dataset row exposes a "topic" field — confirm
   * against the saved dataset.
   */
  const runner = async (input: any) => {
    const result = await openai.chat.completions.create({
      model: "gpt-4",
      messages: [
        { role: "system", content: "You are a great storyteller." },
        { role: "user", content: `Write a story about ${input["topic"]}` },
      ],
    });
    return [result.choices[0].message.content];
  };

  await runExperiment({
    name: "story-function-experiment",
    datasetName: "storyteller-dataset",
    function: runner,
    metrics: ["correctness"],
    projectName: "my-project",
  });
}

// Run the experiment
runFunctionExperiment().catch((error: unknown) => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error("Function experiment failed:", error);
  process.exitCode = 1;
});
import { runExperiment } from "galileo";
import { OpenAI } from "openai";

/**
 * Runs "geography-experiment" in the "my-project" project against a custom
 * dataset defined inline (instead of a saved dataset), using a runner function
 * that calls the OpenAI chat completions API, and requests the "correctness"
 * metric.
 *
 * The OpenAI API key is read from the OPENAI_API_KEY environment variable.
 *
 * @returns A promise that settles once the experiment call has completed.
 */
async function runCustomDatasetExperiment(): Promise<void> {
  const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

  // Custom dataset: each row carries an `input` and an `output` value.
  const dataset = [{ input: "Spain", output: "Europe" }];

  /**
   * Asks the model which continent `input["input"]` belongs to and returns
   * the first choice's message content wrapped in a single-element array.
   */
  const runner = async (input: any) => {
    const result = await openai.chat.completions.create({
      model: "gpt-4",
      messages: [
        { role: "system", content: "You are a geography expert" },
        {
          role: "user",
          content: `Which continent does the following country belong to: ${input["input"]}`,
        },
      ],
    });
    return [result.choices[0].message.content];
  };

  await runExperiment({
    name: "geography-experiment",
    dataset: dataset,
    function: runner,
    metrics: ["correctness"],
    projectName: "my-project",
  });
}

// Run the experiment
runCustomDatasetExperiment().catch((error: unknown) => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error("Custom dataset experiment failed:", error);
  process.exitCode = 1;
});