Add metric
You can create custom evaluation metrics by inheriting from BaseMetric:
from karma.metrics.base_metric_abs import BaseMetric
from karma.registries.metrics_registry import register_metric
@register_metric("medical_accuracy")
class MedicalAccuracyMetric(BaseMetric):
    """Medical-specific accuracy metric with domain weighting.

    A prediction is correct when it equals its reference after lowercasing
    and stripping surrounding whitespace. Examples whose reference contains
    one of the known medical terms are weighted by ``medical_term_weight``;
    all other examples have weight 1.0.
    """

    def __init__(self, medical_term_weight=1.5):
        """Store the medical-term weight and load the term list.

        Args:
            medical_term_weight: Weight applied to an example whose
                reference contains at least one medical term.
        """
        # NOTE(review): BaseMetric.__init__ is not called here -- confirm
        # whether the base class requires initialisation.
        self.medical_term_weight = medical_term_weight
        self.medical_terms = self._load_medical_terms()

    def evaluate(self, predictions, references, **kwargs):
        """Evaluate with medical term weighting.

        Args:
            predictions: Sized sequence of predicted strings.
            references: Sized sequence of reference strings, aligned with
                ``predictions``.
            **kwargs: Extra keyword arguments; not used by this metric.

        Returns:
            A dict with ``medical_accuracy`` (weighted fraction of correct
            examples, 0.0 when there is nothing to score),
            ``total_examples`` and ``total_weight``.

        Raises:
            ValueError: If ``predictions`` and ``references`` differ in
                length.
        """
        # zip() would silently drop the unmatched tail, so the reported
        # accuracy and total_examples would describe different data.
        if len(predictions) != len(references):
            raise ValueError(
                "predictions and references must have the same length "
                f"(got {len(predictions)} and {len(references)})"
            )

        total_score = 0
        total_weight = 0

        for pred, ref in zip(predictions, references):
            # Standard comparison
            is_correct = pred.lower().strip() == ref.lower().strip()

            # Apply weighting for medical terms
            weight = self._get_weight(ref)
            total_weight += weight

            if is_correct:
                total_score += weight

        accuracy = total_score / total_weight if total_weight > 0 else 0.0

        return {
            "medical_accuracy": accuracy,
            "total_examples": len(predictions),
            "total_weight": total_weight,
        }

    def _get_weight(self, text):
        """Get weight based on medical content.

        Returns ``medical_term_weight`` when any medical term occurs as a
        substring of the lowercased ``text``, otherwise 1.0.
        """
        # Lowercase once instead of once per term.
        lowered = text.lower()
        if any(term in lowered for term in self.medical_terms):
            return self.medical_term_weight
        return 1.0

    def _load_medical_terms(self):
        """Load medical terminology (a fixed list of lowercase terms)."""
        return [
            "diabetes",
            "hypertension",
            "surgery",
            "medication",
            "diagnosis",
            "treatment",
            "symptom",
            "therapy",
        ]
Using Custom Metrics
Once registered, custom metrics are discovered automatically, but each one must also be listed in the registration of every dataset you want to evaluate with it.
Let’s say you would like to change the metrics used for the openlifescienceai/pubmedqa dataset.
Update the @register_dataset call in eval_datasets/pubmedqa.py:
@register_dataset(
    DATASET_NAME,
    commit_hash=COMMIT_HASH,
    split=SPLIT,
    # we added the medical accuracy metric to this dataset
    metrics=["exact_match", "medical_accuracy"],
    task_type="mcqa",
)
class PubMedMCQADataset(MedQADataset):
    ...  # class body elided in this example
# The metric will be automatically used if specified in dataset registration
# NOTE(review): my_medical_dataset looks like a placeholder -- presumably it
# should be the dataset whose registration lists the metric; confirm.
karma eval --model qwen --model-path "Qwen/Qwen3-0.6B" \
  --datasets my_medical_dataset