Source code for scanpex.sq._ensembl_mapper

from typing import List

import pandas as pd



[docs]
def ensembl_mapper(
    ensembl_list: List[str],
    species: str = "human",
) -> pd.DataFrame:
    """Map a list of Ensembl gene IDs to their corresponding gene symbols.

    This function queries the MyGene.info API (through the ``mygene`` client)
    to translate Ensembl IDs into gene symbols. If a gene symbol cannot be
    found for a given ID, the original Ensembl ID is retained in the 'symbol'
    column. When the same query appears in more than one returned record, only
    the first record is kept.

    Parameters
    ----------
    ensembl_list : List[str]
        A list of Ensembl gene IDs to be mapped (e.g., ['ENSG00000139618']).
        An empty list yields an empty result without performing any lookup.
    species : str, optional
        The species name or taxonomy ID to restrict the query. Common values
        include 'human' or 'mouse'. Default is 'human'.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame containing two columns:
        - 'query': The original Ensembl ID.
        - 'symbol': The mapped gene symbol (or the original ID if unmapped).

        Rows removed as duplicates are not renumbered, so the index may
        contain gaps when the service returns several records for one query.

    Raises
    ------
    ImportError
        If a lookup is required and the `mygene` package is not installed in
        the current environment.

    Examples
    --------
    >>> ensembl_ids = ["ENSG00000139618", "ENSG00000157764", "INVALID_ID"]
    >>> df = ensembl_mapper(ensembl_ids, species="human")
    >>> print(df)
                 query      symbol
    0  ENSG00000139618       BRCA2
    1  ENSG00000157764        BRAF
    2       INVALID_ID  INVALID_ID
    """
    # Nothing to translate: answer locally instead of sending an empty query
    # to the remote service. A length check (rather than truthiness) keeps
    # array-like inputs such as numpy arrays or pandas Index objects working.
    if hasattr(ensembl_list, "__len__") and len(ensembl_list) == 0:
        return pd.DataFrame(columns=["query", "symbol"])

    try:
        import mygene
    except ImportError as err:
        raise ImportError(
            "mygene is not installed. Please install it using `pip install mygene`."
        ) from err

    mg = mygene.MyGeneInfo()
    hits = mg.querymany(
        ensembl_list,
        scopes="ensembl.gene",
        fields="symbol",
        species=species,
        as_dataframe=True,
    ).reset_index()

    queries = hits["query"]
    if "symbol" in hits.columns:
        # Unmapped IDs come back with a missing symbol; fall back to the ID.
        symbols = hits["symbol"].fillna(queries)
    else:
        # Presumably the 'symbol' column is absent when no queried ID matched
        # at all (TODO confirm against mygene); every row then falls back to
        # its own ID instead of failing on a missing column.
        symbols = queries

    # Build a fresh frame rather than assigning into a column-sliced view.
    gene_symbols = pd.DataFrame({"query": queries, "symbol": symbols})
    return gene_symbols.drop_duplicates(subset=["query"], keep="first")