# Source code for pyhealth.datasets.clinvar

"""ClinVar dataset for PyHealth.

This module provides the ClinVarDataset class for loading and processing
ClinVar variant data for machine learning tasks.
"""

import logging
import os
from pathlib import Path
from typing import List, Optional

import pandas as pd

from .base_dataset import BaseDataset

logger = logging.getLogger(__name__)


class ClinVarDataset(BaseDataset):
    """ClinVar dataset for variant classification.

    ClinVar is a freely accessible, public archive of reports of the
    relationships among human variations and phenotypes, with supporting
    evidence. This dataset enables variant pathogenicity prediction tasks.

    Dataset is available at:
        https://ftp.ncbi.nlm.nih.gov/pub/clinvar/

    On construction, if ``clinvar-pyhealth.csv`` is not already present in
    ``root``, it is generated from the raw ClinVar file via
    :meth:`prepare_metadata` before the base dataset is initialised.

    Args:
        root: Root directory of the raw data containing the ClinVar files.
        tables: Optional list of additional tables to load beyond defaults.
            The ``"variants"`` table is always loaded first.
        dataset_name: Optional name of the dataset. Defaults to "clinvar".
        config_path: Optional path to the configuration file. If not
            provided, uses ``configs/clinvar.yaml`` next to this module.
        **kwargs: Forwarded unchanged to ``BaseDataset.__init__``.

    Attributes:
        root: Root directory of the raw data.
        dataset_name: Name of the dataset.
        config_path: Path to the configuration file.

    Examples:
        >>> from pyhealth.datasets import ClinVarDataset
        >>> dataset = ClinVarDataset(root="/path/to/clinvar")
        >>> dataset.stats()
        >>> samples = dataset.set_task()
        >>> print(samples[0])
    """

    def __init__(
        self,
        root: str,
        tables: Optional[List[str]] = None,
        dataset_name: Optional[str] = None,
        config_path: Optional[str] = None,
        **kwargs,
    ) -> None:
        if config_path is None:
            logger.info("No config path provided, using default config")
            config_path = Path(__file__).parent / "configs" / "clinvar.yaml"

        # Build the standardized CSV only once; an existing file (including
        # an empty placeholder written by a previous run) is reused as-is.
        pyhealth_csv = os.path.join(root, "clinvar-pyhealth.csv")
        if not os.path.exists(pyhealth_csv):
            logger.info("Preparing ClinVar metadata...")
            self.prepare_metadata(root)

        # "variants" is always loaded; caller-supplied tables are appended.
        all_tables = ["variants", *(tables or [])]

        super().__init__(
            root=root,
            tables=all_tables,
            dataset_name=dataset_name or "clinvar",
            config_path=config_path,
            **kwargs,
        )

    @staticmethod
    def prepare_metadata(root: str) -> None:
        """Prepare metadata for the ClinVar dataset.

        Converts the raw ClinVar ``variant_summary`` file found in ``root``
        into ``clinvar-pyhealth.csv`` with standardized column names. Only
        the mapped columns that exist in the raw file are kept, and when an
        assembly column is present only ``GRCh38`` rows are written.

        If no raw file is found, a warning is logged and a header-only
        placeholder CSV is written instead.

        Args:
            root: Root directory containing the ClinVar files. The output
                CSV is written to this same directory.
        """
        # Raw ClinVar column name -> standardized PyHealth column name.
        column_mapping = {
            "GeneSymbol": "gene_symbol",
            "ClinicalSignificance": "clinical_significance",
            "ReviewStatus": "review_status",
            "Chromosome": "chromosome",
            "PositionVCF": "position",
            "ReferenceAlleleVCF": "reference_allele",
            "AlternateAlleleVCF": "alternate_allele",
            "Type": "variant_type",
            "Assembly": "assembly",
        }
        output_path = os.path.join(root, "clinvar-pyhealth.csv")

        # Candidate raw files, in priority order; the first one present wins.
        # NOTE(review): "clinvar.vcf" is accepted here but is read below with
        # the same tab-separated reader and column mapping as
        # variant_summary.txt -- confirm that VCF input is actually supported.
        possible_files = (
            "variant_summary.txt",
            "variant_summary.txt.gz",
            "clinvar_variant_summary.txt",
            "clinvar.vcf",
        )
        candidate_paths = (os.path.join(root, fname) for fname in possible_files)
        raw_file = next(
            (fpath for fpath in candidate_paths if os.path.exists(fpath)), None
        )

        if raw_file is None:
            logger.warning(
                "No raw ClinVar file found in %s. "
                "Please download from https://ftp.ncbi.nlm.nih.gov/pub/clinvar/ "
                "and place variant_summary.txt in the root directory.",
                root,
            )
            # Header-only placeholder so later loading still finds the file;
            # its columns are derived from the mapping so the two cannot drift.
            pd.DataFrame(columns=list(column_mapping.values())).to_csv(
                output_path, index=False
            )
            return

        logger.info("Processing ClinVar file: %s", raw_file)

        # pandas infers gzip compression from the ".gz" suffix by default,
        # so one call covers both the plain and the compressed file.
        df = pd.read_csv(raw_file, sep="\t", low_memory=False)

        # Keep only the mapped columns that exist, in mapping order, then
        # rename them (rename ignores mapping keys that are absent).
        available_cols = [col for col in column_mapping if col in df.columns]
        df_out = df[available_cols].rename(columns=column_mapping)

        # Restrict to the GRCh38 assembly when assembly information exists.
        if "assembly" in df_out.columns:
            df_out = df_out[df_out["assembly"] == "GRCh38"]

        df_out.to_csv(output_path, index=False)
        logger.info("Saved %d variants to %s", len(df_out), output_path)

    @property
    def default_task(self):
        """Returns the default task for this dataset.

        Returns:
            VariantClassificationClinVar: The default classification task.
        """
        # Imported lazily, at call time rather than at module import.
        from pyhealth.tasks import VariantClassificationClinVar

        return VariantClassificationClinVar()