<!-- 代码拉取完成,页面将自动刷新 -->
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="next" href="developing/">
<link rel="icon" href="assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.6.14">
<title>Welcome - BenchmarkQED</title>
<link rel="stylesheet" href="assets/stylesheets/main.342714a4.min.css">
<link rel="stylesheet" href="assets/stylesheets/palette.06af60db.min.css">
<script src="https://wcpstatic.microsoft.com/mscc/lib/v2/wcp-consent.js"></script>
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>/* MkDocs Material storage helpers: __md_scope is the site base URL; __md_hash is a simple 32-bit string hash; __md_get/__md_set read/write JSON values in localStorage keyed by site path (+ key). __md_set swallows storage errors, e.g. quota exceeded or private browsing. */__md_scope=new URL(".",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#welcome-to-benchmarkqed" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="." title="BenchmarkQED" class="md-header__button md-logo" aria-label="BenchmarkQED" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19.375 8.5a3.25 3.25 0 1 1-3.163 4h-3a3.252 3.252 0 0 1-4.443 2.509L7.214 17.76a3.25 3.25 0 1 1-1.342-.674l1.672-2.957A3.24 3.24 0 0 1 6.75 12c0-.907.371-1.727.97-2.316L6.117 6.846A3.253 3.253 0 0 1 1.875 3.75a3.25 3.25 0 1 1 5.526 2.32l1.603 2.836A3.25 3.25 0 0 1 13.093 11h3.119a3.25 3.25 0 0 1 3.163-2.5M10 10.25a1.75 1.75 0 1 0-.001 3.499A1.75 1.75 0 0 0 10 10.25M5.125 2a1.75 1.75 0 1 0 0 3.5 1.75 1.75 0 0 0 0-3.5m12.5 9.75a1.75 1.75 0 1 0 3.5 0 1.75 1.75 0 0 0-3.5 0m-14.25 8.5a1.75 1.75 0 1 0 3.501-.001 1.75 1.75 0 0 0-3.501.001"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
BenchmarkQED
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Welcome
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_0">
<label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_1" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
<input class="md-option" data-md-color-media="" data-md-color-scheme="slate" data-md-color-primary="indigo" data-md-color-accent="indigo" aria-label="Switch to light mode" type="radio" name="__palette" id="__palette_1">
<label class="md-header__button md-icon" title="Switch to light mode" for="__palette_0" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
</form>
<script>/* Restore the user's saved color palette before first paint to avoid a flash of the wrong theme: if the saved choice tracks "(prefers-color-scheme)", resolve it to the matching light/dark palette input, then mirror the resolved color attributes onto the body element. */var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</label>
<nav class="md-search__options" aria-label="Search">
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
Initializing search
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
<div class="md-header__source">
<a href="https://github.com/microsoft/benchmark-qed" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.7.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81"/></svg>
</div>
<div class="md-source__repository">
benchmark-qed
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
<div class="md-grid">
<ul class="md-tabs__list">
<li class="md-tabs__item md-tabs__item--active">
<a href="." class="md-tabs__link">
Home
</a>
</li>
<li class="md-tabs__item">
<a href="cli/autoq/" class="md-tabs__link">
CLI
</a>
</li>
<li class="md-tabs__item">
<a href="notebooks/autoq/" class="md-tabs__link">
Notebooks
</a>
</li>
<li class="md-tabs__item">
<a href="datasets/" class="md-tabs__link">
Datasets
</a>
</li>
</ul>
</div>
</nav>
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="." title="BenchmarkQED" class="md-nav__button md-logo" aria-label="BenchmarkQED" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19.375 8.5a3.25 3.25 0 1 1-3.163 4h-3a3.252 3.252 0 0 1-4.443 2.509L7.214 17.76a3.25 3.25 0 1 1-1.342-.674l1.672-2.957A3.24 3.24 0 0 1 6.75 12c0-.907.371-1.727.97-2.316L6.117 6.846A3.253 3.253 0 0 1 1.875 3.75a3.25 3.25 0 1 1 5.526 2.32l1.603 2.836A3.25 3.25 0 0 1 13.093 11h3.119a3.25 3.25 0 0 1 3.163-2.5M10 10.25a1.75 1.75 0 1 0-.001 3.499A1.75 1.75 0 0 0 10 10.25M5.125 2a1.75 1.75 0 1 0 0 3.5 1.75 1.75 0 0 0 0-3.5m12.5 9.75a1.75 1.75 0 1 0 3.5 0 1.75 1.75 0 0 0-3.5 0m-14.25 8.5a1.75 1.75 0 1 0 3.501-.001 1.75 1.75 0 0 0-3.501.001"/></svg>
</a>
BenchmarkQED
</label>
<div class="md-nav__source">
<a href="https://github.com/microsoft/benchmark-qed" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.7.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81"/></svg>
</div>
<div class="md-source__repository">
benchmark-qed
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_1" checked>
<label class="md-nav__link" for="__nav_1" id="__nav_1_label" tabindex="">
<span class="md-ellipsis">
Home
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_1_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_1">
<span class="md-nav__icon md-icon"></span>
Home
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Welcome
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="." class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Welcome
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#getting-started" class="md-nav__link">
<span class="md-ellipsis">
Getting Started
</span>
</a>
<nav class="md-nav" aria-label="Getting Started">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#installation-instructions" class="md-nav__link">
<span class="md-ellipsis">
Installation Instructions
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#usage" class="md-nav__link">
<span class="md-ellipsis">
Usage
</span>
</a>
<nav class="md-nav" aria-label="Usage">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#autoq" class="md-nav__link">
<span class="md-ellipsis">
AutoQ
</span>
</a>
<nav class="md-nav" aria-label="AutoQ">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#example" class="md-nav__link">
<span class="md-ellipsis">
Example
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#autoe" class="md-nav__link">
<span class="md-ellipsis">
AutoE
</span>
</a>
<nav class="md-nav" aria-label="AutoE">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#example-1-relative-comparison-of-rag-methods" class="md-nav__link">
<span class="md-ellipsis">
Example 1: Relative comparison of RAG methods
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#example-2-scoring-of-rag-answers-against-reference-answers" class="md-nav__link">
<span class="md-ellipsis">
Example 2: Scoring of RAG answers against reference answers
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#autod" class="md-nav__link">
<span class="md-ellipsis">
AutoD
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="developing/" class="md-nav__link">
<span class="md-ellipsis">
Developing
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
CLI
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
CLI
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="cli/autoq/" class="md-nav__link">
<span class="md-ellipsis">
AutoQ
</span>
</a>
</li>
<li class="md-nav__item">
<a href="cli/autoe/" class="md-nav__link">
<span class="md-ellipsis">
AutoE
</span>
</a>
</li>
<li class="md-nav__item">
<a href="cli/llm_config/" class="md-nav__link">
<span class="md-ellipsis">
LLM Configuration
</span>
</a>
</li>
<li class="md-nav__item">
<a href="cli/config_init/" class="md-nav__link">
<span class="md-ellipsis">
Config Init
</span>
</a>
</li>
<li class="md-nav__item">
<a href="cli/data/" class="md-nav__link">
<span class="md-ellipsis">
Dataset Download
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Notebooks
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Notebooks
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="notebooks/autoq/" class="md-nav__link">
<span class="md-ellipsis">
AutoQ
</span>
</a>
</li>
<li class="md-nav__item">
<a href="notebooks/autoe/" class="md-nav__link">
<span class="md-ellipsis">
AutoE
</span>
</a>
</li>
<li class="md-nav__item">
<a href="notebooks/autod/" class="md-nav__link">
<span class="md-ellipsis">
AutoD
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="datasets/" class="md-nav__link">
<span class="md-ellipsis">
Datasets
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#getting-started" class="md-nav__link">
<span class="md-ellipsis">
Getting Started
</span>
</a>
<nav class="md-nav" aria-label="Getting Started">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#installation-instructions" class="md-nav__link">
<span class="md-ellipsis">
Installation Instructions
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#usage" class="md-nav__link">
<span class="md-ellipsis">
Usage
</span>
</a>
<nav class="md-nav" aria-label="Usage">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#autoq" class="md-nav__link">
<span class="md-ellipsis">
AutoQ
</span>
</a>
<nav class="md-nav" aria-label="AutoQ">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#example" class="md-nav__link">
<span class="md-ellipsis">
Example
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#autoe" class="md-nav__link">
<span class="md-ellipsis">
AutoE
</span>
</a>
<nav class="md-nav" aria-label="AutoE">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#example-1-relative-comparison-of-rag-methods" class="md-nav__link">
<span class="md-ellipsis">
Example 1: Relative comparison of RAG methods
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#example-2-scoring-of-rag-answers-against-reference-answers" class="md-nav__link">
<span class="md-ellipsis">
Example 2: Scoring of RAG answers against reference answers
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#autod" class="md-nav__link">
<span class="md-ellipsis">
AutoD
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="welcome-to-benchmarkqed">Welcome to BenchmarkQED</h1>
<pre class="mermaid"><code>flowchart LR
AutoQ["<span style='font-size:1.5em; color:black'><b>AutoQ</b></span><br>LLM synthesis of<br>local-to-global<br>queries for target<br>datasets"] -- creates queries <br>for evaluation --> AutoE["<span style='font-size:1.5em; color:black'><b>AutoE</b></span><br>LLM evaluation of<br>relative answer <br>quality on target <br>metrics"]
AutoE ~~~ AutoD["<span style='font-size:1.5em; color:black'><b>AutoD</b></span><br>LLM summarization<br>of dataset samples<br>to curated target<br>structures"]
AutoD -- curates datasets <br>for evaluation --> AutoE
AutoD -- creates dataset summaries <br>for query synthesis --> AutoQ
style AutoQ fill:#a8d0ed,color:black,font-weight:normal
style AutoE fill:#a8d0ed,color:black,font-weight:normal
style AutoD fill:#a8d0ed,color:black,font-weight:normal
linkStyle 0 stroke:#0077b6,stroke-width:2px
linkStyle 2 stroke:#0077b6,stroke-width:2px
linkStyle 3 stroke:#0077b6,stroke-width:2px</code></pre>
<p>BenchmarkQED is a suite of tools designed for automated benchmarking of retrieval-augmented generation (RAG) systems. It provides components for query generation, evaluation, and dataset preparation to facilitate reproducible testing at scale.</p>
<h2 id="getting-started">Getting Started</h2>
<h3 id="installation-instructions">Installation Instructions</h3>
<p>Install <a href="https://www.python.org/downloads/">Python 3.11+</a></p>
<p>To get started with BenchmarkQED, you have two options:</p>
<ol>
<li><a href="https://pypi.org/project/benchmark-qed/">Install from PyPI</a>:
<div class="highlight"><pre><span></span><code><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a>pip<span class="w"> </span>install<span class="w"> </span>benchmark-qed
</code></pre></div></li>
<li><a href="developing/">Use it from source</a></li>
</ol>
<h2 id="usage">Usage</h2>
<p>The sections below describe the three main components of BenchmarkQED—AutoQ, AutoE, and AutoD. You will also find step-by-step examples demonstrating how to use AutoQ and AutoE, using the Install from PyPI option.</p>
<h3 id="autoq">AutoQ</h3>
<p><img alt="AutoQ diagram" src="images/AutoQ.png" /></p>
<p>The AutoQ component generates four synthetic query classes based on the scope and source of the dataset. </p>
<ul>
<li>
<p><em>Query Scope</em>: the extent of the dataset that the question addresses</p>
<ul>
<li><em>Local</em> queries targeting specific details of a text corpus (e.g., <em>"What are the public health implications of the Alaskapox virus in Alaska?"</em>)</li>
<li><em>Global</em> queries targeting general aspects of a text corpus such as common themes, trends, concerns (e.g., <em>"Across the dataset, what are the main public health initiatives mentioned that target underserved communities?"</em>)</li>
</ul>
</li>
<li>
<p><em>Query Source</em>: the information used to generate local and global queries</p>
<ul>
<li><em>Data-driven</em> queries based on text sampled from the overall corpus</li>
<li><em>Activity-driven</em> queries based on potential activities consistent with the data</li>
</ul>
</li>
</ul>
<p>AutoQ can be configured to generate any number and distribution of synthetic queries along these classes.</p>
<blockquote>
<p><strong>Note:</strong> AutoQ generates queries only; it does <strong>not</strong> produce reference (ground truth) answers for these queries.</p>
</blockquote>
<h4 id="example">Example</h4>
<p>Please follow these steps to generate synthetic queries from the <a href="https://github.com/microsoft/benchmark-qed/tree/main/datasets/AP_news/raw_data">AP news dataset</a>:</p>
<ol>
<li>
<p><strong>Set up your project directory:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-1-1" name="__codelineno-1-1" href="#__codelineno-1-1"></a>mkdir<span class="w"> </span>-p<span class="w"> </span>./local/autoq_test
<a id="__codelineno-1-2" name="__codelineno-1-2" href="#__codelineno-1-2"></a><span class="nb">cd</span><span class="w"> </span>./local/autoq_test
</code></pre></div></p>
</li>
<li>
<p><strong>Download the AP news dataset into the <code>input</code> subfolder</strong>:
<div class="highlight"><pre><span></span><code><a id="__codelineno-2-1" name="__codelineno-2-1" href="#__codelineno-2-1"></a>mkdir<span class="w"> </span>./input
<a id="__codelineno-2-2" name="__codelineno-2-2" href="#__codelineno-2-2"></a>benchmark-qed<span class="w"> </span>data<span class="w"> </span>download<span class="w"> </span>AP_news<span class="w"> </span>input
</code></pre></div>
Alternatively, you can manually download the files from the <a href="https://github.com/microsoft/benchmark-qed/tree/main/datasets/AP_news/raw_data">AP News datasets folder</a>.</p>
</li>
<li>
<p><strong>Create a configuration file:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a>benchmark-qed<span class="w"> </span>config<span class="w"> </span>init<span class="w"> </span>autoq<span class="w"> </span>.
</code></pre></div>
This command creates two files in the <code>./autoq_test</code> directory:</p>
<ul>
<li><code>.env</code>: Contains environment variables for the AutoQ pipeline. Open this file and replace <code><API_KEY></code> with your own OpenAI or Azure API key.</li>
<li><code>settings.yaml</code>: Contains pipeline settings, which you can modify as needed.</li>
</ul>
</li>
<li>
<p><strong>Generate all synthetic query classes:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-4-1" name="__codelineno-4-1" href="#__codelineno-4-1"></a>benchmark-qed<span class="w"> </span>autoq<span class="w"> </span>settings.yaml<span class="w"> </span>output
</code></pre></div></p>
</li>
</ol>
<p>For detailed instructions on configuring and running AutoQ from the command line, see the <a href="cli/autoq/">AutoQ CLI Documentation</a>.</p>
<p>To learn more about the query synthesis process and using AutoQ programmatically, refer to the <a href="notebooks/autoq/">AutoQ Notebook Example</a>.</p>
<h3 id="autoe">AutoE</h3>
<p>AutoE automates the evaluation of RAG methods using the LLM-as-a-Judge approach. It takes as input a set of queries (created by AutoQ or other sources) and corresponding answers generated by RAG systems (produced outside of BenchmarkQED). For each query, AutoE presents an LLM with pairs of answers (along with the query and target metric) in a counterbalanced order, and the model judges whether the first answer wins, loses, or ties with the second. Aggregating these judgments across multiple queries and trials yields <strong>win rates</strong> for each method. By default, AutoE compares RAG answers using <a href="https://github.com/microsoft/benchmark-qed/blob/799b78b6716a8f24fcd354b89a37b429ba1e587a/benchmark_qed/config/model/score.py#L28">four quality metrics</a>: relevance, comprehensiveness, diversity, and empowerment. Users can also define and configure custom evaluation metrics as needed.</p>
<p>When reference answers (such as ground truth or "gold standard" responses) are available, AutoE can evaluate RAG-generated answers against these references using <a href="https://github.com/microsoft/benchmark-qed/blob/799b78b6716a8f24fcd354b89a37b429ba1e587a/benchmark_qed/config/model/score.py#L50">default metrics</a> like correctness, completeness, or other user-defined criteria on a customizable scoring scale.</p>
<blockquote>
<p><strong>Choosing the Right LLM Judge</strong></p>
<p>Selecting an appropriate LLM judge is <strong>crucial</strong> for reliable evaluation. Less capable models can introduce biases or produce unreliable results. To validate your judge model, start with an <strong>A/A test</strong>—compare a RAG method against itself. The expected outcome is a win rate of 0.5, with no statistically significant difference between the two sets of answers. Additionally, manually review the LLM’s scoring and reasoning to spot any systematic errors or biases in its judgments.</p>
</blockquote>
<h4 id="example-1-relative-comparison-of-rag-methods">Example 1: Relative comparison of RAG methods</h4>
<p>Please follow these steps to perform a relative comparison of RAG methods using example question-answer data generated from the <a href="https://github.com/microsoft/benchmark-qed/tree/main/docs/notebooks/example_answers">AP news dataset</a>:</p>
<ol>
<li>
<p><strong>Set up your project directory:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-5-1" name="__codelineno-5-1" href="#__codelineno-5-1"></a>mkdir<span class="w"> </span>-p<span class="w"> </span>./local/pairwise_test
<a id="__codelineno-5-2" name="__codelineno-5-2" href="#__codelineno-5-2"></a><span class="nb">cd</span><span class="w"> </span>./local/pairwise_test
</code></pre></div></p>
</li>
<li>
<p><strong>Download the RAG answers into the <code>input</code> subfolder</strong>:
<div class="highlight"><pre><span></span><code><a id="__codelineno-6-1" name="__codelineno-6-1" href="#__codelineno-6-1"></a>mkdir<span class="w"> </span>./input
<a id="__codelineno-6-2" name="__codelineno-6-2" href="#__codelineno-6-2"></a>benchmark-qed<span class="w"> </span>data<span class="w"> </span>download<span class="w"> </span>example_answers<span class="w"> </span>input
</code></pre></div>
Alternatively, you can manually copy the files inside the <a href="https://github.com/microsoft/benchmark-qed/tree/main/docs/notebooks/example_answers">example answers folder</a>.</p>
</li>
<li>
<p><strong>Create a configuration file for pairwise comparison:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-7-1" name="__codelineno-7-1" href="#__codelineno-7-1"></a>benchmark-qed<span class="w"> </span>config<span class="w"> </span>init<span class="w"> </span>autoe_pairwise<span class="w"> </span>.
</code></pre></div>
This command creates two files in the <code>./pairwise_test</code> directory:</p>
<ul>
<li><code>.env</code>: Contains environment variables for the pairwise comparison tests. Open this file and replace <code><API_KEY></code> with your own OpenAI or Azure API key.</li>
<li><code>settings.yaml</code>: Contains pipeline settings, which you can modify as needed.</li>
</ul>
</li>
<li>
<p><strong>Run the pairwise comparison:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-8-1" name="__codelineno-8-1" href="#__codelineno-8-1"></a>benchmark-qed<span class="w"> </span>autoe<span class="w"> </span>pairwise-scores<span class="w"> </span>settings.yaml<span class="w"> </span>output
</code></pre></div></p>
</li>
</ol>
<h4 id="example-2-scoring-of-rag-answers-against-reference-answers">Example 2: Scoring of RAG answers against reference answers</h4>
<p>Please follow these steps to score RAG answers against reference answers using example data from the <a href="https://github.com/microsoft/benchmark-qed/tree/main/docs/notebooks/example_answers">AP news dataset</a>:</p>
<ol>
<li>
<p><strong>Set up your project directory:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-9-1" name="__codelineno-9-1" href="#__codelineno-9-1"></a>mkdir<span class="w"> </span>-p<span class="w"> </span>./local/reference_test
<a id="__codelineno-9-2" name="__codelineno-9-2" href="#__codelineno-9-2"></a><span class="nb">cd</span><span class="w"> </span>./local/reference_test
</code></pre></div></p>
</li>
<li>
<p><strong>Download the RAG answers and reference answers into the <code>input</code> subfolder</strong>:
<div class="highlight"><pre><span></span><code><a id="__codelineno-10-1" name="__codelineno-10-1" href="#__codelineno-10-1"></a>mkdir<span class="w"> </span>./input
<a id="__codelineno-10-2" name="__codelineno-10-2" href="#__codelineno-10-2"></a>benchmark-qed<span class="w"> </span>data<span class="w"> </span>download<span class="w"> </span>example_answers<span class="w"> </span>input
</code></pre></div>
Alternatively, you can manually copy the files inside the <a href="https://github.com/microsoft/benchmark-qed/tree/main/docs/notebooks/example_answers">example data folder</a>.</p>
</li>
<li>
<p><strong>Create a configuration file for reference scoring:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-11-1" name="__codelineno-11-1" href="#__codelineno-11-1"></a>benchmark-qed<span class="w"> </span>config<span class="w"> </span>init<span class="w"> </span>autoe_reference<span class="w"> </span>.
</code></pre></div>
This command creates two files in the <code>./reference_test</code> directory:</p>
<ul>
<li><code>.env</code>: Contains environment variables for the reference scoring tests. Open this file and replace <code><API_KEY></code> with your own OpenAI or Azure API key.</li>
<li><code>settings.yaml</code>: Contains pipeline settings, which you can modify as needed.</li>
</ul>
</li>
<li>
<p><strong>Run the reference scoring:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-12-1" name="__codelineno-12-1" href="#__codelineno-12-1"></a>benchmark-qed<span class="w"> </span>autoe<span class="w"> </span>reference-scores<span class="w"> </span>settings.yaml<span class="w"> </span>output
</code></pre></div>
For more details on configuring and running AutoE, see the <a href="cli/autoe/">AutoE CLI Documentation</a>.</p>
</li>
</ol>
<p>For detailed instructions on configuring and running AutoE subcommands, please refer to the <a href="cli/autoe/">AutoE CLI Documentation</a>.</p>
<p>To learn how to use AutoE programmatically, please see the <a href="notebooks/autoe/">AutoE Notebook Example</a>.</p>
<h3 id="autod">AutoD</h3>
<p>The AutoD component provides two main data utilities:</p>
<ul>
<li>
<p>Data sampling: Samples datasets to meet a target specification, defined by the number of topic clusters (breadth) and the number of samples per cluster (depth)</p>
</li>
<li>
<p>Data summarization: Summarizes input or output datasets in a way that reflects their topic coverage, using a map-reduce process. These summaries play an important role in the AutoQ query synthesis process, but they can also be used more broadly, such as in prompts where context space is limited.</p>
</li>
</ul>
<p>To learn more about AutoD's data utilities, please see: <a href="notebooks/autod/">AutoD Notebook Example</a>.</p>
</article>
</div>
<script>/* If the URL fragment targets a named tabbed-content radio input (mkdocs-material "__tabbed_*" toggles), check it so the corresponding tab is selected on load. */var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<nav class="md-footer__inner md-grid" aria-label="Footer" >
<a href="developing/" class="md-footer__link md-footer__link--next" aria-label="Next: Developing">
<div class="md-footer__title">
<span class="md-footer__direction">
Next
</span>
<div class="md-ellipsis">
Developing
</div>
</div>
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11z"/></svg>
</div>
</a>
</nav>
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
<div class="md-copyright__highlight">
© 2025 Microsoft | <a href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy</a> | <a href="https://go.microsoft.com/fwlink/?LinkId=2259814">Consumer Health Privacy</a> | <a onclick="window.manageConsent();">Cookies</a> | <a href="https://go.microsoft.com/fwlink/?LinkID=206977">Terms of Use</a> | <a href="https://www.microsoft.com/trademarks">Trademarks</a>
</div>
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"base": ".", "features": ["content.code.copy", "content.code.select", "navigation.footer", "navigation.tabs"], "search": "assets/javascripts/workers/search.d50fe291.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="assets/javascripts/bundle.13a4f30d.min.js"></script>
<script src="scripts/create_cookie_banner.js"></script>
</body>
</html>
<!--
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。
-->