Galileo Experiments allow you to evaluate and improve your LLM applications by running tests against datasets and measuring performance using various metrics.
The simplest way to get started is by using a prompt template:
import { createPromptTemplate, runExperiment } from "galileo";

/**
 * Creates the "storyteller-prompt" prompt template in "my-project" and runs
 * the "story-experiment" experiment with it against the
 * "storyteller-dataset" dataset, scored with the "correctness" metric.
 *
 * @returns {Promise<void>} Resolves once `runExperiment` has settled.
 */
async function runPromptTemplateExperiment() {
  const template = await createPromptTemplate({
    template: [
      { role: "system", content: "You are a great storyteller." },
      // `{{topic}}` is a template placeholder; presumably it is filled from
      // the `topic` field of each dataset row (confirm against the dataset).
      { role: "user", content: "Write a story about {{topic}}" },
    ],
    projectName: "my-project",
    name: "storyteller-prompt",
  });

  await runExperiment({
    name: "story-experiment",
    datasetName: "storyteller-dataset",
    promptTemplate: template,
    metrics: ["correctness"],
    projectName: "my-project",
  });
}

// Run the experiment
runPromptTemplateExperiment();
For more complex scenarios, you can use a runner function:
import { runExperiment } from "galileo";
import { OpenAI } from "openai";

/**
 * Runs the "story-function-experiment" experiment against the
 * "storyteller-dataset" dataset, using a custom runner function that calls
 * the OpenAI chat completions API for each input. Results are scored with
 * the "correctness" metric in "my-project".
 *
 * Reads the OpenAI API key from the OPENAI_API_KEY environment variable.
 *
 * @returns {Promise<void>} Resolves once `runExperiment` has settled.
 */
async function runFunctionExperiment() {
  const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

  // Invoked with one input at a time; builds the prompt from `input.topic`
  // and returns the raw chat completion object unchanged.
  const runner = async (input) => {
    const result = await openai.chat.completions.create({
      model: "gpt-4",
      messages: [
        { role: "system", content: "You are a great storyteller." },
        { role: "user", content: `Write a story about ${input["topic"]}` },
      ],
    });
    return result;
  };

  await runExperiment({
    name: "story-function-experiment",
    datasetName: "storyteller-dataset",
    // NOTE(review): the custom-dataset example passes its callback under the
    // `function` key rather than `runner`; confirm which key the installed
    // SDK version expects.
    runner: runner,
    metrics: ["correctness"],
    projectName: "my-project",
  });
}

// Run the experiment
runFunctionExperiment();
// Example: running an experiment against an inline (custom) dataset instead
// of a named, pre-existing dataset.
import { runExperiment } from "galileo";
import { OpenAI } from "openai";

/**
 * Runs the "geography-experiment" experiment against an in-memory dataset,
 * using a custom function that asks the OpenAI chat completions API which
 * continent a country belongs to. Results are scored with the "correctness"
 * metric in "my-project".
 *
 * Reads the OpenAI API key from the OPENAI_API_KEY environment variable.
 *
 * @returns {Promise<void>} Resolves once `runExperiment` has settled.
 */
async function runCustomDatasetExperiment() {
  const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

  // Inline dataset: each row pairs an `input` with its `expected` answer.
  const dataset = [{ input: "Spain", expected: "Europe" }];

  // Builds the prompt from `input["input"]` and returns the raw chat
  // completion object unchanged.
  const runner = async (input) => {
    const result = await openai.chat.completions.create({
      model: "gpt-4",
      messages: [
        { role: "system", content: "You are a geography expert" },
        {
          role: "user",
          content: `Which continent does the following country belong to: ${input["input"]}`,
        },
      ],
    });
    return result;
  };

  await runExperiment({
    name: "geography-experiment",
    dataset: dataset,
    function: runner,
    metrics: ["correctness"],
    projectName: "my-project",
  });
}

// Run the experiment
runCustomDatasetExperiment();