Source code for scanpex.sq._ensembl_mapper

from typing import List

import pandas as pd


def ensembl_mapper(
    ensembl_list: List[str],
    species: str = "human",
) -> pd.DataFrame:
    """Map a list of Ensembl gene IDs to their corresponding gene symbols.

    This function queries the MyGene.info API to translate Ensembl IDs into
    gene symbols. If a gene symbol cannot be found for a given ID, the
    original Ensembl ID is retained in the 'symbol' column. Duplicate queries
    are dropped, keeping only the first matched record.

    Parameters
    ----------
    ensembl_list : List[str]
        A list of Ensembl gene IDs to be mapped
        (e.g., ['ENSG00000139618']). An empty input yields an empty result
        without contacting the service.
    species : str, optional
        The species name or taxonomy ID to restrict the query. Common values
        include 'human' or 'mouse'. Default is 'human'.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame containing two columns:

        - 'query': The original Ensembl ID.
        - 'symbol': The mapped gene symbol (or the original ID if unmapped).

    Raises
    ------
    ImportError
        If the `mygene` package is not installed in the current environment
        and a query actually has to be sent (i.e. the input is non-empty).

    Examples
    --------
    >>> ensembl_ids = ["ENSG00000139618", "ENSG00000157764", "INVALID_ID"]
    >>> df = ensembl_mapper(ensembl_ids, species="human")  # doctest: +SKIP
    >>> print(df)  # doctest: +SKIP
                 query      symbol
    0  ENSG00000139618       BRCA2
    1  ENSG00000157764        BRAF
    2       INVALID_ID  INVALID_ID
    """
    columns = ["query", "symbol"]

    # Nothing to look up: return an empty, correctly-shaped frame instead of
    # sending an empty query. ``len`` is used rather than truthiness so that
    # array-like inputs (whose truth value is ambiguous) are handled too;
    # unsized iterables are simply forwarded to the client unchanged.
    try:
        is_empty = len(ensembl_list) == 0
    except TypeError:
        is_empty = False
    if is_empty:
        return pd.DataFrame(columns=columns)

    try:
        import mygene
    except ImportError as err:
        raise ImportError(
            "mygene is not installed. Please install it using `pip install mygene`."
        ) from err

    mg = mygene.MyGeneInfo()
    hits = mg.querymany(
        ensembl_list,
        scopes="ensembl.gene",
        fields="symbol",
        species=species,
        as_dataframe=True,
    ).reset_index()

    # When no ID in the batch resolves, the response carries no 'symbol'
    # column at all; ``reindex`` creates it as NaN so the fallback below
    # still applies instead of failing with a KeyError.
    gene_symbols = hits.reindex(columns=columns)

    # Unmapped IDs fall back to the queried ID itself.
    gene_symbols["symbol"] = gene_symbols["symbol"].fillna(gene_symbols["query"])
    return gene_symbols.drop_duplicates(subset=["query"], keep="first")