## 1. 为 MMEval 添加一个评测指标

在 MMEval 中实现一个自定义评测指标，需要继承 BaseMetric 并且实现 add 和 compute_metric 方法。

在评测过程中，评测指标需要在调用 add 后更新 _results 以存储中间结果。在最后进行指标计算的时候，将会对 _results 进行进程同步后调用 compute_metric 进行指标的计算。

以实现 Accuracy 指标为例：

In [None]:
import numpy as np
from mmeval.core import BaseMetric

class Accuracy(BaseMetric):
    """Minimal example metric: fraction of predictions equal to their label.

    Intermediate results are stored per sample in ``self._results`` by
    ``add`` and reduced to a single number in ``compute_metric``.
    """

    def add(self, predictions, labels):
        """Store one ``(prediction, label)`` pair per sample.

        Args:
            predictions: Iterable of per-sample predictions.
            labels: Iterable of per-sample labels, paired with
                ``predictions`` by position.
        """
        for prediction, label in zip(predictions, labels):
            self._results.append((prediction, label))

    def compute_metric(self, results):
        """Compute accuracy from the stored pairs.

        Args:
            results: List of ``(prediction, label)`` tuples as stored by
                ``add``.

        Returns:
            dict: ``{'accuracy': value}``.
        """
        # ``add`` stores one item per sample, so each entry is typically a
        # scalar; ``np.concatenate`` rejects zero-dimensional inputs, hence
        # every entry is promoted to at least 1-D first. Entries that are
        # already arrays are passed through unchanged.
        predictions = np.concatenate(
            [np.atleast_1d(res[0]) for res in results])
        labels = np.concatenate(
            [np.atleast_1d(res[1]) for res in results])
        correct = (predictions == labels)
        accuracy = correct.sum() / len(predictions)
        return {'accuracy': accuracy}

## 2. 从原 OpenMMLab 2.0 算法库中迁移评测指标至 MMEval

虽然 MMEval 目前已经发布了，但是仍然有一大部分原 OpenMMLab 算法库中的评测指标尚未迁移添加到 MMEval 中，我们整理了这些尚未添加的评测指标，发布了社区任务，欢迎各位小伙伴一起来提 PR 参与建设 MMEval：https://github.com/open-mmlab/mmeval/issues/50

详细的注意事项可参考：[MMEval 适配指南](https://aicarrier.feishu.cn/docs/doccnjJiEdOYTdse9zH9Y2Bt8De)

下面以 mmseg 中的 [IoUMetric](https://github.com/open-mmlab/mmsegmentation/blob/a06bf4d66349f19d2384bfdb5085d2d0f6ee98e0/mmseg/evaluation/metrics/iou_metric.py#L15) 为例，为大家展示如何将一个 OpenMMLab 2.0 算法库中的评测指标迁移至 MMEval，并在原算法库中适配使用 MMEval。

### 2.1 为 MMEval 添加 MeanIoU

见：[mmeval/metrics/mean_iou.py](https://github.com/open-mmlab/mmeval/blob/e2f6dbf160d41adfa185caed211f2ac50747f3ba/mmeval/metrics/mean_iou.py#L21)

In [None]:
# Copyright (c) OpenMMLab. All rights reserved.

import numpy as np
from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, overload

from mmeval.core.base_metric import BaseMetric
from mmeval.core.dispatcher import dispatch
from mmeval.utils import try_import

# Bind the deep-learning framework names used below. Static type checkers
# see the real packages; at runtime each name is obtained via ``try_import``.
if TYPE_CHECKING:
    import paddle
    import tensorflow
    import tensorflow as tf
    import torch
else:
    # Look the frameworks up through try_import instead of a plain import.
    # NOTE(review): presumably this keeps the frameworks optional when they
    # are not installed -- confirm against mmeval.utils.try_import.
    paddle = try_import('paddle')
    torch = try_import('torch')
    tf = try_import('tensorflow')


# 类 docstring 规范，注意添加评测指标使用示例
class MeanIoU(BaseMetric):
    """MeanIoU evaluation metric.

    MeanIoU is a widely used evaluation metric for image semantic segmentation.

    In addition to mean iou, it will also compute and return accuracy, mean
    accuracy, mean dice, mean precision, mean recall and mean f-score.

    This metric supports 4 kinds of inputs, i.e. ``numpy.ndarray``,
    ``torch.Tensor``, ``tensorflow.Tensor`` and ``paddle.Tensor``, and the
    implementation for the calculation depends on the inputs type.

    Args:
        num_classes (int, optional): The number of classes. If None, it will be
            obtained from the 'num_classes' or 'classes' field in
            `self.dataset_meta`. Defaults to None.
        ignore_index (int, optional): Index that will be ignored in evaluation.
            Defaults to 255.
        nan_to_num (int, optional): If specified, NaN values will be replaced
            by the numbers defined by the user. Defaults to None.
        beta (int, optional): Determines the weight of recall in the F-score.
            Defaults to 1.
        classwise_results (bool, optional): Whether to return the computed
            results of each class. Defaults to False.
        **kwargs: Keyword arguments passed to :class:`BaseMetric`.

    Examples:

        >>> from mmeval import MeanIoU
        >>> miou = MeanIoU(num_classes=4)

    Use NumPy implementation:

        >>> import numpy as np
        >>> labels = np.asarray([[[0, 1, 1], [2, 3, 2]]])
        >>> preds = np.asarray([[[0, 2, 1], [1, 3, 2]]])
        >>> miou(preds, labels)
        {'aAcc': 0.6666666666666666,
         'mIoU': 0.6666666666666666,
         'mAcc': 0.75,
         'mDice': 0.75,
         'mPrecision': 0.75,
         'mRecall': 0.75,
         'mFscore': 0.75,
         'kappa': 0.5384615384615384}

    Use PyTorch implementation:

        >>> import torch
        >>> labels = torch.Tensor([[[0, 1, 1], [2, 3, 2]]])
        >>> preds = torch.Tensor([[[0, 2, 1], [1, 3, 2]]])
        >>> miou(preds, labels)
        {'aAcc': 0.6666666666666666,
         'mIoU': 0.6666666666666666,
         'mAcc': 0.75,
         'mDice': 0.75,
         'mPrecision': 0.75,
         'mRecall': 0.75,
         'mFscore': 0.75,
         'kappa': 0.5384615384615384}

    Accumulate batch:

        >>> for i in range(10):
        ...     labels = torch.randint(0, 4, size=(100, 10, 10))
        ...     predicts = torch.randint(0, 4, size=(100, 10, 10))
        ...     miou.add(predicts, labels)
        >>> miou.compute()  # doctest: +SKIP
    """

    def __init__(self,
                 num_classes: Optional[int] = None,
                 ignore_index: int = 255,
                 nan_to_num: Optional[int] = None,
                 beta: int = 1,
                 classwise_results: bool = False,
                 **kwargs) -> None:
        super().__init__(**kwargs)

        self._num_classes = num_classes
        self.ignore_index = ignore_index
        self.nan_to_num = nan_to_num
        self.beta = beta
        self.classwise_results = classwise_results

    # `num_classes` can be set in two ways: at initialization, or through
    # `dataset_meta`.
    @property
    def num_classes(self) -> int:
        """Returns the number of classes.

        The number of classes should be set during initialization, otherwise it
        will be obtained from the 'classes' or 'num_classes' field in
        ``self.dataset_meta``. The resolved value is cached on the instance.

        Raises:
            RuntimeError: If the num_classes is not set.

        Returns:
            int: The number of classes.
        """
        if self._num_classes is not None:
            return self._num_classes
        if self.dataset_meta and 'num_classes' in self.dataset_meta:
            self._num_classes = self.dataset_meta['num_classes']
        elif self.dataset_meta and 'classes' in self.dataset_meta:
            self._num_classes = len(self.dataset_meta['classes'])
        else:
            raise RuntimeError(
                'The `num_classes` is required, and not found in '
                f'dataset_meta: {self.dataset_meta}')
        return self._num_classes

    # The `add` method must be overridden; it stores the intermediate results
    # in `self._results`. Its parameters should be kept simple and clear.
    def add(self, predictions: Sequence, labels: Sequence) -> None:  # type: ignore # yapf: disable # noqa: E501
        """Process one batch of data and predictions.

        Calculate the following 3 stuff from the inputs and store them in
        ``self._results``:

        - num_tp_per_class: the number of true positive per-class.
        - num_gts_per_class: the number of ground truth per-class.
        - num_preds_per_class: the number of prediction per-class.

        Args:
            predictions (Sequence): A sequence of the predicted segmentation
                mask.
            labels (Sequence): A sequence of the segmentation mask labels.
        """
        for prediction, label in zip(predictions, labels):
            assert prediction.shape == label.shape, 'The shape of ' \
                '`prediction` and `label` should be the same, but got: ' \
                f'{prediction.shape} and {label.shape}'
            # We assert the prediction and label should be a segmentation mask.
            assert len(prediction.shape) == 2, 'The dimension of ' \
                f'`prediction` should be 2, but got shape: {prediction.shape}'
            # Store the intermediate result used to calculate IoU.
            # Rows of the confusion matrix index the label and columns index
            # the prediction (flat index is ``num_classes * label + pred``).
            confusion_matrix = self.compute_confusion_matrix(
                prediction, label, self.num_classes)
            num_tp_per_class = np.diag(confusion_matrix)
            num_gts_per_class = confusion_matrix.sum(1)
            num_preds_per_class = confusion_matrix.sum(0)
            self._results.append(
                (num_tp_per_class, num_gts_per_class, num_preds_per_class), )

    # Decorated with `dispatch`: the implementation used to compute the
    # confusion matrix is selected according to the input data type.
    @overload  # type: ignore
    @dispatch
    def compute_confusion_matrix(self, prediction: np.ndarray,
                                 label: np.ndarray,
                                 num_classes: int) -> np.ndarray:
        """Compute confusion matrix with NumPy.

        Args:
            prediction (numpy.ndarray): The prediction.
            label (numpy.ndarray): The ground truth.
            num_classes (int): The number of classes.

        Returns:
            numpy.ndarray: The computed confusion matrix.
        """
        mask = (label != self.ignore_index)
        # Cast to int64 before combining: with a narrow integer dtype such as
        # uint8, ``num_classes * label + prediction`` is evaluated in that
        # dtype and silently wraps around, corrupting the bin indices.
        prediction = prediction[mask].astype(np.int64)
        label = label[mask].astype(np.int64)
        confusion_matrix_1d = np.bincount(
            num_classes * label + prediction, minlength=num_classes**2)
        confusion_matrix = confusion_matrix_1d.reshape(num_classes,
                                                       num_classes)
        return confusion_matrix

    @overload  # type: ignore
    @dispatch
    def compute_confusion_matrix(  # type: ignore
            self, prediction: 'torch.Tensor', label: 'torch.Tensor',
            num_classes: int) -> np.ndarray:
        """Compute confusion matrix with PyTorch.

        Args:
            prediction (torch.Tensor): The prediction.
            label (torch.Tensor): The ground truth.
            num_classes (int): The number of classes.

        Returns:
            numpy.ndarray: The computed confusion matrix.
        """
        mask = (label != self.ignore_index)
        # Cast to int64 before combining: narrow integer tensors (e.g. uint8)
        # would overflow in ``num_classes * label + prediction``, and
        # ``torch.bincount`` only accepts integer input.
        prediction, label = prediction[mask].long(), label[mask].long()
        confusion_matrix_1d = torch.bincount(
            num_classes * label + prediction, minlength=num_classes**2)
        confusion_matrix = confusion_matrix_1d.reshape(num_classes,
                                                       num_classes)
        return confusion_matrix.cpu().numpy()

    @overload
    @dispatch
    def compute_confusion_matrix(  # type: ignore
            self, prediction: 'paddle.Tensor', label: 'paddle.Tensor',
            num_classes: int) -> np.ndarray:
        """Compute confusion matrix with Paddle.

        Args:
            prediction (paddle.Tensor): The prediction.
            label (paddle.Tensor): The ground truth.
            num_classes (int): The number of classes.

        Returns:
            numpy.ndarray: The computed confusion matrix.
        """
        mask = (label != self.ignore_index)
        prediction, label = prediction[mask], label[mask]
        # NOTE: Since the `paddle.bincount` has bug on the CUDA device, we use
        # the `np.bincount` instead. Once the bug is fixed, we will use
        # `paddle.bincount`.
        # For more see at: https://github.com/PaddlePaddle/Paddle/issues/46978
        confusion_matrix_1d = np.bincount(
            num_classes * label + prediction, minlength=num_classes**2)
        confusion_matrix = confusion_matrix_1d.reshape(
            (num_classes, num_classes))
        return confusion_matrix

    @dispatch
    def compute_confusion_matrix(  # type: ignore
            self, prediction: 'tensorflow.Tensor', label: 'tensorflow.Tensor',
            num_classes: int) -> np.ndarray:
        """Compute confusion matrix with TensorFlow.

        Args:
            prediction (tensorflow.Tensor): The prediction.
            label (tensorflow.Tensor): The ground truth.
            num_classes (int): The number of classes.

        Returns:
            numpy.ndarray: The computed confusion matrix.
        """
        mask = (label != self.ignore_index)
        prediction, label = prediction[mask], label[mask]
        confusion_matrix_1d = tf.math.bincount(
            tf.cast(num_classes * label + prediction, tf.int32),
            minlength=num_classes**2)
        confusion_matrix = tf.reshape(confusion_matrix_1d,
                                      (num_classes, num_classes))
        return confusion_matrix.numpy()

    # This method must be overridden. It computes the metric after
    # `self._results` has been synchronized across processes, and it must
    # return a dict.
    def compute_metric(
        self,
        results: List[Tuple[np.ndarray, np.ndarray, np.ndarray]],
    ) -> dict:
        """Compute the MeanIoU metric.

        This method would be invoked in `BaseMetric.compute` after distributed
        synchronization.

        Args:
            results (List[tuple]): This list has already been synced across all
                ranks. This is a list of tuple, and each tuple has the
                following elements:

                - (numpy.ndarray): The number of true positive per-class on a
                  sample.
                - (numpy.ndarray): The number of ground truth per-class on a
                  sample.
                - (numpy.ndarray): The number of prediction per-class on a
                  sample.

        Returns:
            Dict: The computed metric, with following keys:

            - aAcc, the overall accuracy, namely pixel accuracy.
            - mIoU, the mean Intersection-Over-Union (IoU) for all classes.
            - mAcc, the mean accuracy for all classes, namely mean pixel
              accuracy.
            - mDice, the mean dice coefficient for all classes.
            - mPrecision, the mean precision for all classes.
            - mRecall, the mean recall for all classes.
            - mFscore, the mean f-score for all classes.
            - kappa, the Cohen's kappa coefficient.
            - classwise_results, the evaluate results of each classes.
              This would be returned if ``self.classwise_results`` is True.
        """
        # Gather the `num_tp_per_class` from batches results.
        num_tp_per_class: np.ndarray = sum(res[0] for res in results)
        # Gather the `num_gts_per_class` from batches results.
        num_gts_per_class: np.ndarray = sum(res[1] for res in results)
        # Gather the `num_preds_per_class` from batches results.
        num_preds_per_class: np.ndarray = sum(res[2] for res in results)

        # Computing overall accuracy.
        overall_acc = num_tp_per_class.sum() / num_gts_per_class.sum()

        # NOTE: the per-class divisions below produce NaN for classes whose
        # denominator is zero; `_mean` either replaces or skips those NaNs.

        # compute iou per class
        union = num_preds_per_class + num_gts_per_class - num_tp_per_class
        iou = num_tp_per_class / union

        # compute accuracy per class
        accuracy = num_tp_per_class / num_gts_per_class

        # compute dice per class
        dice = 2 * num_tp_per_class / (num_preds_per_class + num_gts_per_class)

        # compute precision, recall and f-score per class
        precision = num_tp_per_class / num_preds_per_class
        recall = num_tp_per_class / num_gts_per_class
        f_score = (1 + self.beta**2) * (precision * recall) / (
            (self.beta**2 * precision) + recall)

        # compute kappa coefficient
        po = num_tp_per_class.sum() / num_gts_per_class.sum()
        pe = (num_gts_per_class * num_preds_per_class).sum() / (
            num_gts_per_class.sum()**2)
        kappa = (po - pe) / (1 - pe)

        def _mean(values: np.ndarray):
            # Replace NaN with the user-specified number when requested;
            # otherwise NaN entries are ignored by `np.nanmean`.
            if self.nan_to_num is not None:
                values = np.nan_to_num(values, nan=self.nan_to_num)
            return np.nanmean(values)

        metric_results = {
            'aAcc': overall_acc,
            'mIoU': _mean(iou),
            'mAcc': _mean(accuracy),
            'mDice': _mean(dice),
            'mPrecision': _mean(precision),
            'mRecall': _mean(recall),
            'mFscore': _mean(f_score),
            'kappa': kappa,
        }

        # Add the class-wise metric results to the returned results.
        if self.classwise_results:
            metric_results['classwise_results'] = {
                'IoU': iou,
                'Acc': accuracy,
                'Dice': dice,
                'Precision': precision,
                'Recall': recall,
                'Fscore': f_score,
            }
        return metric_results

### 2.2 在 MMSegmentation 中使用 MeanIoU

见：[open-mmlab/mmsegmentation/pull/2003](https://github.com/open-mmlab/mmsegmentation/pull/2003)

In [None]:
# Copyright (c) OpenMMLab. All rights reserved.

import warnings
import numpy as np
from typing import Sequence
from mmengine.logging import print_log
from prettytable import PrettyTable
from mmeval.metrics import MeanIoU

from mmseg.registry import METRICS


# 继承 mmeval.MeanIoU
@METRICS.register_module()
class IoUMetric(MeanIoU):
    """A wrapper of ``mmeval.MeanIoU``.

    This wrapper implements the `process` method that parses predictions and
    labels from inputs. This enables ``mmengine.Evaluator`` to handle the data
    flow of different tasks through a unified interface.

    In addition, this wrapper also implements the ``evaluate`` method that
    parses metric results and print pretty table of metrics per class.

    Args:
        dist_backend (str | None): The name of the distributed communication
            backend. Refer to :class:`mmeval.BaseMetric`.
            Defaults to 'torch_cuda'.
        **kwargs: Keyword parameters passed to :class:`mmeval.MeanIoU`.
            ``classwise_results`` defaults to True here. The deprecated
            ``iou_metrics`` and ``collect_device`` are accepted, ignored and
            reported with a warning.
    """

    def __init__(self, dist_backend='torch_cuda', **kwargs):
        # Deprecated parameters are removed from kwargs so they are not
        # forwarded to `MeanIoU`; a warning is emitted when they were given.
        iou_metrics = kwargs.pop('iou_metrics', None)
        if iou_metrics is not None:
            warnings.warn(
                'DeprecationWarning: The `iou_metrics` parameter of '
                '`IoUMetric` is deprecated, defaults return all metrics now!')
        collect_device = kwargs.pop('collect_device', None)

        if collect_device is not None:
            warnings.warn(
                'DeprecationWarning: The `collect_device` parameter of '
                '`IoUMetric` is deprecated, use `dist_backend` instead.')

        # Changes the default value of `classwise_results` to True. Using
        # `setdefault` keeps it a real default: an explicit
        # `classwise_results=...` from the caller no longer collides with a
        # hard-coded keyword and raises TypeError.
        kwargs.setdefault('classwise_results', True)
        super().__init__(dist_backend=dist_backend, **kwargs)

    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
        """Process one batch of data and data_samples.

        Parse predictions and labels from ``data_samples`` and invoke
        ``self.add``.

        Args:
            data_batch (dict): A batch of data from the dataloader.
            data_samples (Sequence[dict]): A batch of outputs from the model.
        """
        predictions, labels = [], []
        for data_sample in data_samples:
            pred_label = data_sample['pred_sem_seg']['data'].squeeze()
            # `.to(pred_label)` aligns the label with the prediction tensor.
            label = data_sample['gt_sem_seg']['data'].squeeze().to(pred_label)
            predictions.append(pred_label)
            labels.append(label)

        self.add(predictions, labels)

    def evaluate(self, *args, **kwargs):
        """Returns metric results and print pretty table of metrics per class.

        This method would be invoked by ``mmengine.Evaluator``.

        Returns:
            dict: The summary metrics, converted to percentages and rounded
            to 2 decimal places. Class-wise results are only logged, not
            returned.
        """
        metric_results = self.compute(*args, **kwargs)
        self.reset()

        # The class-wise entry is absent when `classwise_results` was
        # explicitly disabled; in that case no table is printed.
        classwise_results = metric_results.pop('classwise_results', None)

        if classwise_results is not None:
            # Pretty table of the metric results per class.
            summary_table = PrettyTable()
            summary_table.add_column('Class', self.dataset_meta['classes'])
            for key, value in classwise_results.items():
                value = np.round(value * 100, 2)
                summary_table.add_column(key, value)

            print_log('per class results:', logger='current')
            print_log('\n' + summary_table.get_string(), logger='current')

        # Multiply value by 100 to convert to percentage and rounding.
        evaluate_results = {
            k: round(v * 100, 2) for k, v in metric_results.items()}
        return evaluate_results