Skip to content

Benchmark

A benchmark within mteb is essentially just a list of tasks along with some metadata about the benchmark.

An overview of the benchmark within mteb

This metadata includes a short description of the benchmark's intention, the reference, and the citation. If you use a benchmark from mteb, we recommend that you cite it along with mteb.

Utilities

mteb.get_benchmarks(names=None, display_on_leaderboard=None)

Get a list of benchmarks by name.

Parameters:

Name Type Description Default
names list[str] | None

A list of benchmark names to retrieve. If None, all benchmarks are returned.

None
display_on_leaderboard bool | None

If specified, filters benchmarks by whether they are displayed on the leaderboard.

None

Returns:

Type Description
list[Benchmark]

A list of Benchmark instances.

Source code in mteb/benchmarks/get_benchmark.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def get_benchmarks(
    names: list[str] | None = None, display_on_leaderboard: bool | None = None
) -> list[Benchmark]:
    """Retrieve several benchmarks at once, optionally filtered by leaderboard visibility.

    Args:
        names: Benchmark names to look up. When None, every key of the
            benchmark registry is used instead.
        display_on_leaderboard: When not None, only benchmarks whose
            `display_on_leaderboard` value is this exact boolean are kept.

    Returns:
        The resolved Benchmark instances, in the order of the requested names.
    """
    registry = _build_registry()
    requested = list(registry.keys()) if names is None else names

    # Each name goes through `get_benchmark`, so that function's lookup and
    # error behaviour apply to every requested name.
    resolved = [get_benchmark(requested_name) for requested_name in requested]

    if display_on_leaderboard is None:
        return resolved
    return [
        benchmark
        for benchmark in resolved
        if benchmark.display_on_leaderboard is display_on_leaderboard
    ]

mteb.get_benchmark(benchmark_name)

Get a benchmark by name.

Parameters:

Name Type Description Default
benchmark_name str

The name of the benchmark to retrieve.

required

Returns:

Type Description
Benchmark

The Benchmark instance corresponding to the given name.

Source code in mteb/benchmarks/get_benchmark.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def get_benchmark(
    benchmark_name: str,
) -> Benchmark:
    """Look up a single benchmark by its name or by one of its aliases.

    Args:
        benchmark_name: The name (or alias) of the benchmark to retrieve.

    Returns:
        The Benchmark instance registered under the given name. The alias
        registry is consulted before the main registry.

    Raises:
        KeyError: If the name is in neither registry. The message suggests the
            closest registered name when `difflib` finds one.
    """
    registry = _build_registry()
    aliases = _build_aliases_registry()

    if benchmark_name in aliases:
        return aliases[benchmark_name]
    if benchmark_name in registry:
        return registry[benchmark_name]

    # Unknown name: build a helpful message from the closest registry key.
    candidates = difflib.get_close_matches(benchmark_name, registry.keys())
    if not candidates:
        message = f"'{benchmark_name}' not found and no similar keys were found."
    else:
        message = f"'{benchmark_name}' not found. Did you mean: {candidates[0]}?"
    raise KeyError(message)

The Benchmark Object

mteb.Benchmark dataclass

A benchmark object intended to run a certain benchmark within MTEB.

Parameters:

Name Type Description Default
name str

The name of the benchmark

required
aliases Sequence[str]

Alternative names for the benchmark

tuple()
tasks Sequence[AbsTask]

The tasks within the benchmark.

required
description str | None

A description of the benchmark, should include its intended goal and potentially a description of its construction

None
reference StrURL | None

A link reference, to a source containing additional information typically to a paper, leaderboard or github.

None
citation str | None

A bibtex citation

None
contacts list[str] | None

The people to contact in case of a problem in the benchmark, preferably a GitHub handle.

None
superseded_by Sequence[str] | None

Benchmark name with newer version of benchmark

None
aggregations Sequence[BenchmarkAggregation]

Which aggregations to use on the leaderboard

(MEAN_TASK, MEAN_TASK_TYPE, TASK_TYPES)
summary_sort_column

The column to sort benchmarks by on leaderboard

required

Examples:

>>> Benchmark(
...     name="MTEB(custom)",
...     tasks=mteb.get_tasks(
...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
...         languages=["eng"],
...     ),
...     description="A custom benchmark"
... )
Source code in mteb/benchmarks/benchmark.py
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
@dataclass
class Benchmark:
    """A benchmark object intended to run a certain benchmark within MTEB.

    Args:
        name: The name of the benchmark
        aliases: Alternative names for the benchmark
        tasks: The tasks within the benchmark.
        description: A description of the benchmark, should include its intended goal and potentially a description of its construction
        reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github.
        citation: A bibtex citation
        contacts: The people to contact in case of a problem in the benchmark, preferably a GitHub handle.
        icon: Optional icon for the benchmark (not used inside this class; presumably consumed by the leaderboard).
        display_name: Optional display name (not used inside this class; presumably consumed by the leaderboard).
        language_view: Languages forwarded to the per-language table builder. `"all"` or a non-empty list enables the
            per-language table; the default empty list yields a "not available" placeholder table.
        benchmark_hf_repo: Hugging Face dataset repo id used by `push_benchmark_card_to_hub` and `push_eval_to_hub`.
            Both raise `ValueError` when it is None.
        superseded_by: Benchmark name with newer version of benchmark
        aggregations: Which aggregations to use on the leaderboard
        show_zero_shot: Whether the leaderboard summary table surfaces the Zero-shot column.
        summary_sort_column: The column to sort benchmarks by on leaderboard (class-level setting, `None` keeps the default sort).
        summary_rank_column: Name of the rank column added when `summary_sort_column` is set (class-level setting).

    Examples:
        >>> Benchmark(
        ...     name="MTEB(custom)",
        ...     tasks=mteb.get_tasks(
        ...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
        ...         languages=["eng"],
        ...     ),
        ...     description="A custom benchmark"
        ... )
    """

    name: str
    tasks: Sequence[AbsTask]
    aliases: Sequence[str] = field(default_factory=tuple)
    description: str | None = None
    reference: StrURL | None = None
    citation: str | None = None
    contacts: list[str] | None = None
    icon: str | None = None
    display_name: str | None = None
    language_view: list[str] | Literal["all"] = field(default_factory=list)
    benchmark_hf_repo: str | None = None
    superseded_by: Sequence[str] | None = None
    # Api aggregation functions
    aggregations: Sequence[BenchmarkAggregation] = (
        BenchmarkAggregation.MEAN_TASK,
        BenchmarkAggregation.MEAN_TASK_TYPE,
        BenchmarkAggregation.TASK_TYPES,
    )
    # Whether the leaderboard summary table surfaces the Zero-shot column.
    # Off for benchmarks where model training-data annotations don't cover
    # the task set (e.g. ViDoRe), so every row would otherwise render as a
    # misleading 100%. The API echoes this on ``BenchmarkSummarySchema`` and
    # the frontend hides the column when False.
    show_zero_shot: bool = True
    # Sort column(s) for the leaderboard summary. ``None`` keeps the default
    # ``Rank (Borda)`` sort; a string or tuple of strings sorts by those
    # columns descending and adds a 1-indexed ``summary_rank_column`` rank.
    summary_sort_column: ClassVar[str | Sequence[str] | None] = None
    # Name of the 1-indexed rank column added when ``summary_sort_column`` is
    # set. ``None`` falls back to ``"Rank"`` (Borda stays as a trailing col).
    summary_rank_column: ClassVar[str | None] = None

    @property
    def display_on_leaderboard(self) -> bool:
        """Whether the benchmark should be displayed on the leaderboard."""
        benchmarks_on_leaderboard = _get_benchmarks_on_leaderboard()
        return self.name in benchmarks_on_leaderboard

    def __iter__(self) -> Iterator[AbsTask]:
        """Iterate over the benchmark's tasks."""
        return iter(self.tasks)

    def __len__(self) -> int:
        """Return the number of tasks in the benchmark."""
        return len(self.tasks)

    def __getitem__(self, index: int) -> AbsTask:
        """Return the task at position `index` in `tasks`."""
        return self.tasks[index]

    def _build_per_task_pivot(  # noqa: PLR6301
        self, pl_df: pl.DataFrame
    ) -> tuple[pl.DataFrame, list[str]] | None:
        """Compute the standard (model × task) wide pivot once.

        Callers building both summary and per-task tables from the same long
        frame can pass the result to both via the ``pivot`` kwarg to halve
        polars CPU on the pivot step. Subclasses whose summary builder needs
        an is_public-aware pivot still benefit because their per-task table
        builder reuses this one. ``None`` when the input frame is empty.
        """
        from mteb.benchmarks._create_table import _build_per_task_pivot

        return _build_per_task_pivot(pl_df)

    def _create_summary_table(self, pl_df: pl.DataFrame) -> SummaryTable:
        """Create summary table from a long polars pre-agg frame.

        Thin wrapper around
        [_create_summary_table][mteb.benchmarks._create_table._create_summary_table]
        that forwards
        [aggregations][mteb.benchmarks.benchmark.Benchmark.aggregations],
        [summary_sort_column][mteb.benchmarks.benchmark.Benchmark.summary_sort_column],
        and
        [summary_rank_column][mteb.benchmarks.benchmark.Benchmark.summary_rank_column].
        Called by the leaderboard app.
        """
        from mteb.benchmarks._create_table import _create_summary_table

        return _create_summary_table(
            pl_df,
            aggregations=self.aggregations,
            sort_by=self.summary_sort_column,
            rank_column_name=self.summary_rank_column,
        )

    def _create_per_task_table(  # noqa: PLR6301
        self,
        pl_df: pl.DataFrame,
        *,
        pivot: tuple[pl.DataFrame, list[str]] | None = None,
    ) -> pl.DataFrame:
        """Create per-task table from a long polars pre-agg frame. Called by the leaderboard app."""
        from mteb.benchmarks._create_table import (
            _create_per_task_table_from_benchmark_results,
        )

        return _create_per_task_table_from_benchmark_results(pl_df, pivot=pivot)

    def _create_per_language_table(self, pl_df: pl.DataFrame) -> pl.DataFrame:
        """Create per-language table from a long polars pre-agg frame. Called by the leaderboard app."""
        from mteb.benchmarks._create_table import (
            _create_per_language_table_from_benchmark_results,
        )

        if self.language_view == "all" or len(self.language_view) > 0:
            return _create_per_language_table_from_benchmark_results(
                pl_df, self.language_view
            )
        # No language view configured: return a one-cell placeholder table.
        return pl.DataFrame(
            {
                "No results": [
                    "The per-language table is not available for this benchmark."
                ]
            }
        )

    def push_collection_to_hub(
        self,
        hf_username: str,
        collection_name: str | None = None,
    ) -> None:
        """Push the benchmark collection to Hugging Face Hub.

        Reuses the owner's collection whose title equals the collection name,
        or creates a new one, then adds every task dataset that is not already
        an item of the collection. Aggregate tasks contribute the datasets of
        their sub-tasks.

        Args:
            hf_username: Hugging Face username or organization name
            collection_name: Name for the collection on Hugging Face Hub. If not provided, the benchmark name will be used.
        """
        collections = huggingface_hub.list_collections(owner=hf_username)
        collection_name = collection_name or self.name
        existing_collection = None
        for collection in collections:
            if collection.title == collection_name:
                existing_collection = collection
                break

        if existing_collection is None:
            description = self.description
            if description and len(description) > 150:
                description = description[:147] + "..."
            collection = huggingface_hub.create_collection(
                title=collection_name,
                namespace=hf_username,
                # hf collections have a 150 character limit for description, so we truncate it if it's too long
                description=description if description else None,
            )
        else:
            # list collections would output only 4 items
            collection = huggingface_hub.get_collection(
                collection_slug=existing_collection.slug
            )

        # Track what is already in the collection so repeated dataset paths
        # (several tasks sharing one repo) are added only once.
        existing_items = {item.item_id for item in collection.items}

        for task in self.tasks:
            tasks = (
                cast("AbsTaskAggregate", task).tasks if task.is_aggregate else [task]
            )
            for benchmark_task in tasks:
                task_path = benchmark_task.metadata.dataset["path"]
                if task_path in existing_items:
                    continue
                huggingface_hub.add_collection_item(
                    collection_slug=collection.slug,
                    item_id=task_path,
                    item_type="dataset",
                )
                existing_items.add(task_path)

    def __repr__(self) -> str:
        """Return a compact representation with the description truncated to 50 characters."""
        n_tasks = len(self.tasks)
        max_len = 50
        desc = self.description if self.description else ""
        # Close the quote in both branches so that a truncated description is
        # still rendered as a balanced, quoted string.
        desc = f"'{desc[:max_len]}...'" if len(desc) > max_len else f"'{desc}'"
        return f"{self.__class__.__name__}(name='{self.name}', description={desc}, tasks=[...] (#{n_tasks}), ...)"

    def _generate_benchmark_card(self) -> DatasetCard:
        """Generate a README/dataset card for this benchmark.

        The card is rendered from `benchmark_card_template.md` located next to
        this module, with one row per task (name, reference, simplified type
        and description).
        """
        template_path = Path(__file__).parent / "benchmark_card_template.md"

        task_rows = [
            {
                "name": task.metadata.name,
                "reference": task.metadata.reference,
                "simplified_type": task.metadata.simplified_task_type,
                "description": task.metadata.description or "",
            }
            for task in self.tasks
        ]

        return cast(
            "DatasetCard",
            DatasetCard.from_template(
                card_data=DatasetCardData(tags=["mteb", "benchmark"]),
                template_path=str(template_path),
                benchmark_name=self.name,
                benchmark_description=self.description,
                tasks=task_rows,
                citation=self.citation,
            ),
        )

    def push_benchmark_card_to_hub(
        self,
        *,
        create_pr: bool = False,
    ) -> None:
        """Push a README benchmark card to the HuggingFace Hub dataset repo.

        The dataset repo is created first when it does not exist yet.

        Args:
            create_pr: Forwarded to the card's `push_to_hub` call.

        Raises:
            ValueError: If `benchmark_hf_repo` is not set.
        """
        if self.benchmark_hf_repo is None:
            raise ValueError(
                "`benchmark_hf_repo` must be set to push a benchmark card to the hub."
            )

        if not huggingface_hub.repo_exists(self.benchmark_hf_repo, repo_type="dataset"):
            huggingface_hub.create_repo(
                self.benchmark_hf_repo,
                repo_type="dataset",
            )

        card = self._generate_benchmark_card()
        card.push_to_hub(
            self.benchmark_hf_repo,
            repo_type="dataset",
            commit_message="Add benchmark card",
            create_pr=create_pr,
        )

    def push_eval_to_hub(
        self,
        *,
        create_pr: bool = False,
    ) -> None:
        """Push `eval.yaml` to the HuggingFace Hub

        If the repo already contains an `eval.yaml`, the benchmark's config is
        merged with it before uploading.

        Args:
            create_pr: Whether to create the PR

        Raises:
            ValueError: If `benchmark_hf_repo` is not set.
        """
        eval_file_name = "eval.yaml"

        if self.benchmark_hf_repo is None:
            raise ValueError(
                "`benchmark_hf_repo` must be set to push eval config to the hub."
            )

        existing_eval_path = _get_file_on_hub(
            repo_id=self.benchmark_hf_repo,
            file_name=eval_file_name,
            repo_type="dataset",
        )

        # handle multiple tasks in one repo (e.g. BRIGHT)
        existing_eval = None
        if existing_eval_path is not None:
            with Path(existing_eval_path).open(encoding="utf-8") as f:
                existing_eval_dict = yaml.safe_load(f)
            if existing_eval_dict is not None:
                existing_eval = HFEvalMeta.model_validate(existing_eval_dict)

        benchmark_config = self._to_hf_eval_config()
        benchmark_config = (
            benchmark_config.merge(existing_eval) if existing_eval else benchmark_config
        )

        # Upload the YAML as in-memory bytes. Re-opening a still-open
        # NamedTemporaryFile by name is not portable (it fails on Windows),
        # and `upload_file` accepts raw bytes directly.
        huggingface_hub.upload_file(
            path_or_fileobj=benchmark_config.to_yaml().encode("utf-8"),
            path_in_repo=eval_file_name,
            repo_id=self.benchmark_hf_repo,
            repo_type="dataset",
            commit_message="Add eval config",
            create_pr=create_pr,
        )

    def _to_hf_eval_config(self) -> HFEvalMeta:
        """Build the eval config for this benchmark: a single task entry whose id is the benchmark name."""
        return HFEvalMeta(
            name=self.name,
            description=self.description,
            tasks=[
                HFEvalTaskConfig(
                    id=self.name,
                    config=None,
                    split=None,
                )
            ],
        )

    def _get_model_score(
        self,
        model_result: ModelResult,
    ) -> dict[str, float | None]:
        """Compute aggregated scores for a single model.

        Drives the per-aggregation compute via
        [BenchmarkAggregation.aggregate][mteb.benchmarks.benchmark.BenchmarkAggregation.aggregate],
        so the keys returned match what
        [_create_summary_table][mteb.benchmarks.benchmark.Benchmark._create_summary_table]
        would surface for the same `self.aggregations` set.

        Args:
            model_result: The model whose results to aggregate.

        Returns:
            dict: Score keys produced by `self.aggregations` mapped to their
                values. Keys include `"Mean(Task)"`, `"Mean(TaskType)"`,
                per-type means, `"Mean(Public)"`/`"Mean(Private)"`, and
                `"Mean(Subset)"` depending on the aggregation set.

        Raises:
            ValueError: If the model is missing results for some benchmark tasks.
        """
        filtered = model_result.select_tasks(self.tasks).task_results
        if len(filtered) < len(self.tasks):
            raise ValueError(
                "Some scores of benchmark are missing. Please, run model on full benchmark tasks"
            )

        scores: dict[str, float | None] = {}
        for aggregation in self.aggregations:
            scores.update(aggregation.aggregate(filtered))
        return scores

    def get_score(
        self,
        results: BenchmarkResults,
        *,
        raise_error: bool = False,
    ) -> dict[str, dict[str, float | None]]:
        """Get aggregated scores for all models in *results*.

        The benchmark class controls how scores are aggregated — subclasses may
        override this method to customise the returned metrics.

        Args:
            results: A `BenchmarkResults` object containing the model
                results to score.
            raise_error: Whether to raise an error on missing results. When
                False, a warning is logged and the model's entry is filled
                with `None` values keyed by task name instead.

        Returns:
            A dict mapping each model name to a dict whose keys are
            determined by
            [aggregations][mteb.benchmarks.benchmark.Benchmark.aggregations].
            Possible keys include:

            - `"Mean(Task)"`: mean score across all benchmark tasks (when
                [MEAN_TASK][mteb.benchmarks.benchmark.BenchmarkAggregation.MEAN_TASK]
                is enabled).
            - `"Mean(TaskType)"`: mean of per-task-type means (when
                [MEAN_TASK_TYPE][mteb.benchmarks.benchmark.BenchmarkAggregation.MEAN_TASK_TYPE]
                is enabled).
            - per-task-type means keyed by raw type name (e.g. `"Retrieval"`)
                when
                [TASK_TYPES][mteb.benchmarks.benchmark.BenchmarkAggregation.TASK_TYPES]
                is enabled.
            - `"Mean(Public)"` / `"Mean(Private)"` when
                [PUBLIC_PRIVATE][mteb.benchmarks.benchmark.BenchmarkAggregation.PUBLIC_PRIVATE]
                is enabled.
            - `"Rank"`: Borda count rank (1 = best). Each model earns
                `n - rank` points per task; points are summed and the model
                with the highest total is ranked 1. Matches the leaderboard.
                Always present.

        Raises:
            ValueError: If `raise_error` is True and a model is missing
                results for some benchmark tasks.
        """
        from mteb.benchmarks._create_table import _get_borda_rank

        bench_results = results.join_revisions()
        scores: dict[str, dict[str, float | None]] = {}
        per_task_rows: dict[str, dict[str, float | None]] = {}

        for model_result in bench_results:
            # Register the model up front so it keeps a (possibly empty) row
            # in the ranking frame even when its results are incomplete.
            per_task_rows[model_result.model_name] = {}
            filtered = model_result.select_tasks(self.tasks).task_results
            try:
                scores[model_result.model_name] = self._get_model_score(model_result)
            except ValueError:
                if raise_error:
                    raise
                logger.warning(
                    "Some task results are missing. Filling results with None"
                )
                # Placeholder entry, keyed by task name rather than by
                # aggregation key.
                scores[model_result.model_name] = {
                    t.metadata.name: None for t in self.tasks
                }
                continue

            per_task_rows[model_result.model_name] = {
                tr.task_name: tr.get_score() for tr in filtered
            }

        # Borda ranks can only be computed when at least one model contributed
        # at least one task column; otherwise every model gets `Rank = None`.
        borda_list = None
        if per_task_rows:
            per_task_df = pd.DataFrame.from_dict(per_task_rows, orient="index").reindex(
                list(per_task_rows.keys())
            )
            if per_task_df.shape[1] > 0:
                per_task_pl = pl.from_pandas(
                    per_task_df.reset_index(names="model_name")
                )
                task_cols = list(per_task_df.columns)
                borda_list = (
                    per_task_pl.select(_get_borda_rank(task_cols)).to_series().to_list()
                )

        if borda_list is None:
            for model_scores in scores.values():
                model_scores["Rank"] = None
        else:
            for name, rank in zip(per_task_df.index, borda_list):
                scores[name]["Rank"] = int(rank)

        return scores

display_on_leaderboard property

Whether the benchmark should be displayed on the leaderboard.

get_score(results, *, raise_error=False)

Get aggregated scores for all models in results.

The benchmark class controls how scores are aggregated — subclasses may override this method to customise the returned metrics.

Parameters:

Name Type Description Default
results BenchmarkResults

A BenchmarkResults object containing the model results to score.

required
raise_error bool

Whether to raise an error on missing results.

False

Returns:

Type Description
dict[str, dict[str, float | None]]

A dict mapping each model name to a dict whose keys are determined by aggregations (mteb.benchmarks.benchmark.Benchmark.aggregations). Possible keys include:

  • "Mean(Task)": mean score across all benchmark tasks (when MEAN_TASK (mteb.benchmarks.benchmark.BenchmarkAggregation.MEAN_TASK) is enabled).
  • "Mean(TaskType)": mean of per-task-type means (when MEAN_TASK_TYPE (mteb.benchmarks.benchmark.BenchmarkAggregation.MEAN_TASK_TYPE) is enabled).
  • per-task-type means keyed by raw type name (e.g. "Retrieval") when TASK_TYPES (mteb.benchmarks.benchmark.BenchmarkAggregation.TASK_TYPES) is enabled.
  • "Mean(Public)" / "Mean(Private)" when PUBLIC_PRIVATE (mteb.benchmarks.benchmark.BenchmarkAggregation.PUBLIC_PRIVATE) is enabled.
  • "Rank": Borda count rank (1 = best). Each model earns n - rank points per task; points are summed and the model with the highest total is ranked 1. Matches the leaderboard. Always present.
Source code in mteb/benchmarks/benchmark.py
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
def get_score(
    self,
    results: BenchmarkResults,
    *,
    raise_error: bool = False,
) -> dict[str, dict[str, float | None]]:
    """Compute the aggregated benchmark scores of every model in *results*.

    Aggregation is owned by the benchmark class, so subclasses may override
    this method to customise which metrics are returned.

    Args:
        results: A `BenchmarkResults` object holding the model results to
            score.
        raise_error: Whether to raise an error on missing results. When
            False, a warning is logged and the model's entry is filled with
            `None` values keyed by task name instead.

    Returns:
        A dict mapping each model name to a dict whose keys are determined by
        [aggregations][mteb.benchmarks.benchmark.Benchmark.aggregations].
        Possible keys include:

        - `"Mean(Task)"`: mean score across all benchmark tasks (when
            [MEAN_TASK][mteb.benchmarks.benchmark.BenchmarkAggregation.MEAN_TASK]
            is enabled).
        - `"Mean(TaskType)"`: mean of per-task-type means (when
            [MEAN_TASK_TYPE][mteb.benchmarks.benchmark.BenchmarkAggregation.MEAN_TASK_TYPE]
            is enabled).
        - per-task-type means keyed by raw type name (e.g. `"Retrieval"`)
            when
            [TASK_TYPES][mteb.benchmarks.benchmark.BenchmarkAggregation.TASK_TYPES]
            is enabled.
        - `"Mean(Public)"` / `"Mean(Private)"` when
            [PUBLIC_PRIVATE][mteb.benchmarks.benchmark.BenchmarkAggregation.PUBLIC_PRIVATE]
            is enabled.
        - `"Rank"`: Borda count rank (1 = best). Each model earns
            `n - rank` points per task; points are summed and the model
            with the highest total is ranked 1. Matches the leaderboard.
            Always present.

    Raises:
        ValueError: If `raise_error` is True and a model is missing results
            for some benchmark tasks.
    """
    from mteb.benchmarks._create_table import _get_borda_rank

    scores: dict[str, dict[str, float | None]] = {}
    per_task_rows: dict[str, dict[str, float | None]] = {}

    for model_result in results.join_revisions():
        model_name = model_result.model_name
        # Register the model up front so it keeps a (possibly empty) row in
        # the ranking frame even when its results are incomplete.
        per_task_rows[model_name] = {}
        task_results = model_result.select_tasks(self.tasks).task_results
        try:
            scores[model_name] = self._get_model_score(model_result)
        except ValueError:
            if raise_error:
                raise
            logger.warning(
                "Some task results are missing. Filling results with None"
            )
            # Placeholder entry, keyed by task name rather than by
            # aggregation key.
            scores[model_name] = {
                benchmark_task.metadata.name: None for benchmark_task in self.tasks
            }
        else:
            per_task_rows[model_name] = {
                task_result.task_name: task_result.get_score()
                for task_result in task_results
            }

    # Borda ranks need at least one model with at least one task column;
    # otherwise every model receives `Rank = None`.
    borda_ranks = None
    if per_task_rows:
        per_task_df = pd.DataFrame.from_dict(per_task_rows, orient="index")
        per_task_df = per_task_df.reindex(list(per_task_rows))
        if per_task_df.shape[1] > 0:
            task_columns = list(per_task_df.columns)
            per_task_pl = pl.from_pandas(per_task_df.reset_index(names="model_name"))
            rank_frame = per_task_pl.select(_get_borda_rank(task_columns))
            borda_ranks = rank_frame.to_series().to_list()

    if borda_ranks is None:
        for model_scores in scores.values():
            model_scores["Rank"] = None
        return scores

    for ranked_name, rank in zip(per_task_df.index, borda_ranks):
        scores[ranked_name]["Rank"] = int(rank)
    return scores

push_benchmark_card_to_hub(*, create_pr=False)

Push a README benchmark card to the HuggingFace Hub dataset repo.

Source code in mteb/benchmarks/benchmark.py
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
def push_benchmark_card_to_hub(
    self,
    *,
    create_pr: bool = False,
) -> None:
    """Upload this benchmark's generated README card to its Hugging Face dataset repo.

    The dataset repo named by `benchmark_hf_repo` is created first when it
    does not exist yet.

    Args:
        create_pr: Forwarded to the card's `push_to_hub` call.

    Raises:
        ValueError: If `benchmark_hf_repo` is not set.
    """
    repo_id = self.benchmark_hf_repo
    if repo_id is None:
        raise ValueError(
            "`benchmark_hf_repo` must be set to push a benchmark card to the hub."
        )

    repo_present = huggingface_hub.repo_exists(repo_id, repo_type="dataset")
    if not repo_present:
        huggingface_hub.create_repo(repo_id, repo_type="dataset")

    self._generate_benchmark_card().push_to_hub(
        repo_id,
        repo_type="dataset",
        commit_message="Add benchmark card",
        create_pr=create_pr,
    )

push_collection_to_hub(hf_username, collection_name=None)

Push the benchmark collection to Hugging Face Hub.

Parameters:

Name Type Description Default
hf_username str

Hugging Face username or organization name

required
collection_name str | None

Name for the collection on Hugging Face Hub. If not provided, the benchmark name will be used.

None
Source code in mteb/benchmarks/benchmark.py
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
def push_collection_to_hub(
    self,
    hf_username: str,
    collection_name: str | None = None,
) -> None:
    """Create or extend a Hugging Face Hub collection with this benchmark's datasets.

    The owner's collection whose title equals the collection name is reused
    when present; otherwise a new one is created. Every task dataset not
    already in the collection is then added. Aggregate tasks contribute the
    datasets of their sub-tasks.

    Args:
        hf_username: Hugging Face username or organization name
        collection_name: Name for the collection on Hugging Face Hub. If not provided, the benchmark name will be used.
    """
    owned_collections = huggingface_hub.list_collections(owner=hf_username)
    title = collection_name or self.name
    matching = next(
        (candidate for candidate in owned_collections if candidate.title == title),
        None,
    )

    if matching is not None:
        # list collections would output only 4 items
        collection = huggingface_hub.get_collection(collection_slug=matching.slug)
    else:
        # hf collections have a 150 character limit for description, so we truncate it if it's too long
        summary = self.description
        if summary and len(summary) > 150:
            summary = summary[:147] + "..."
        collection = huggingface_hub.create_collection(
            title=title,
            namespace=hf_username,
            description=summary if summary else None,
        )

    # Track what is already in the collection so a dataset path shared by
    # several tasks is added only once.
    known_paths = {entry.item_id for entry in collection.items}

    for task in self.tasks:
        if task.is_aggregate:
            members = cast("AbsTaskAggregate", task).tasks
        else:
            members = [task]
        for member in members:
            dataset_path = member.metadata.dataset["path"]
            if dataset_path not in known_paths:
                huggingface_hub.add_collection_item(
                    collection_slug=collection.slug,
                    item_id=dataset_path,
                    item_type="dataset",
                )
                known_paths.add(dataset_path)

push_eval_to_hub(*, create_pr=False)

Push eval.yaml to the HuggingFace Hub

Parameters:

Name Type Description Default
create_pr bool

Whether to create the PR

False
Source code in mteb/benchmarks/benchmark.py
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
def push_eval_to_hub(
    self,
    *,
    create_pr: bool = False,
) -> None:
    """Push `eval.yaml` to the HuggingFace Hub

    If the repo already contains an `eval.yaml`, the benchmark's config is
    merged with it before uploading.

    Args:
        create_pr: Whether to create the PR

    Raises:
        ValueError: If `benchmark_hf_repo` is not set.
    """
    eval_file_name = "eval.yaml"

    if self.benchmark_hf_repo is None:
        raise ValueError(
            "`benchmark_hf_repo` must be set to push eval config to the hub."
        )

    existing_eval_path = _get_file_on_hub(
        repo_id=self.benchmark_hf_repo,
        file_name=eval_file_name,
        repo_type="dataset",
    )

    # handle multiple tasks in one repo (e.g. BRIGHT)
    existing_eval = None
    if existing_eval_path is not None:
        with Path(existing_eval_path).open(encoding="utf-8") as f:
            existing_eval_dict = yaml.safe_load(f)
        # An empty YAML file loads as None; treat it as "no existing config".
        if existing_eval_dict is not None:
            existing_eval = HFEvalMeta.model_validate(existing_eval_dict)

    benchmark_config = self._to_hf_eval_config()
    benchmark_config = (
        benchmark_config.merge(existing_eval) if existing_eval else benchmark_config
    )

    # Upload the YAML as in-memory bytes. Re-opening a still-open
    # NamedTemporaryFile by name is not portable (it fails on Windows), and
    # `upload_file` accepts raw bytes directly.
    huggingface_hub.upload_file(
        path_or_fileobj=benchmark_config.to_yaml().encode("utf-8"),
        path_in_repo=eval_file_name,
        repo_id=self.benchmark_hf_repo,
        repo_type="dataset",
        commit_message="Add eval config",
        create_pr=create_pr,
    )