Batch variant scoring

Batch variant scoring#

In this notebook, we demonstrate how to score many variants using AlphaGenome.

To prepare numerous variants for scoring, provide the following information as columns in a tab-separated Variant Call Format (VCF) file:

  • variant_id: A unique identifier for each variant.

  • CHROM: Chromosome, specified as a string beginning with ‘chr’ (e.g., ‘chr1’).

  • POS: Integer representing the base pair position (1-based; build hg38 (human) or mm10 (mouse) - see FAQ).

  • REF: The reference nucleotide sequence at the specified position.

  • ALT: The alternate nucleotide sequence at the specified position.

Tip

Open this tutorial in Google colab for interactive viewing.

# @title Install AlphaGenome

# @markdown Run this cell to install AlphaGenome.
from IPython.display import clear_output
! pip install alphagenome
clear_output()
# @title Setup and imports.

from io import StringIO
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
from google.colab import data_table, files
from google.colab import userdata
import pandas as pd
from tqdm import tqdm

data_table.enable_dataframe_formatter()

# Load the model.
dna_model = dna_client.create(userdata.get('ALPHA_GENOME_API_KEY'))
# @title Score batch of variants.

# Load VCF file containing variants.
vcf_file = 'placeholder'  # @param

# We provide an example list of variants to illustrate.
vcf_file = """variant_id\tCHROM\tPOS\tREF\tALT
chr3_58394738_A_T_b38\tchr3\t58394738\tA\tT
chr8_28520_G_C_b38\tchr8\t28520\tG\tC
chr16_636337_G_A_b38\tchr16\t636337\tG\tA
chr16_1135446_G_T_b38\tchr16\t1135446\tG\tT
"""

vcf = pd.read_csv(StringIO(vcf_file), sep='\t')

required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT']
for column in required_columns:
  if column not in vcf.columns:
    raise ValueError(f'VCF file is missing required column: {column}.')

organism = 'human'  # @param ["human", "mouse"] {type:"string"}

# @markdown Specify length of sequence around variants to predict:
sequence_length = '1MB'  # @param ["2KB", "16KB", "100KB", "500KB", "1MB"] { type:"string" }
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
    f'SEQUENCE_LENGTH_{sequence_length}'
]

# @markdown Specify which scorers to use to score your variants:
score_rna_seq = True  # @param { type: "boolean"}
score_cage = True  # @param { type: "boolean" }
score_procap = True  # @param { type: "boolean" }
score_atac = True  # @param { type: "boolean" }
score_dnase = True  # @param { type: "boolean" }
score_chip_histone = True  # @param { type: "boolean" }
score_chip_tf = True  # @param { type: "boolean" }
score_polyadenylation = True  # @param { type: "boolean" }
score_splice_sites = True  # @param { type: "boolean" }
score_splice_site_usage = True  # @param { type: "boolean" }
score_splice_junctions = True  # @param { type: "boolean" }

# @markdown Other settings:
download_predictions = False  # @param { type: "boolean" }

# Parse organism specification.
organism_map = {
    'human': dna_client.Organism.HOMO_SAPIENS,
    'mouse': dna_client.Organism.MUS_MUSCULUS,
}
organism = organism_map[organism]

# Parse scorer specification.
scorer_selections = {
    'rna_seq': score_rna_seq,
    'cage': score_cage,
    'procap': score_procap,
    'atac': score_atac,
    'dnase': score_dnase,
    'chip_histone': score_chip_histone,
    'chip_tf': score_chip_tf,
    'polyadenylation': score_polyadenylation,
    'splice_sites': score_splice_sites,
    'splice_site_usage': score_splice_site_usage,
    'splice_junctions': score_splice_junctions,
}

all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
selected_scorers = [
    all_scorers[key]
    for key in all_scorers
    if scorer_selections.get(key.lower(), False)
]

# Remove any scorers or output types that are not supported for the chosen organism.
unsupported_scorers = [
    scorer
    for scorer in selected_scorers
    if (
        organism.value
        not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
    )
    | (
        (scorer.requested_output == dna_client.OutputType.PROCAP)
        & (organism == dna_client.Organism.MUS_MUSCULUS)
    )
]
if len(unsupported_scorers) > 0:
  print(
      f'Excluding {unsupported_scorers} scorers as they are not supported for'
      f' {organism}.'
  )
  for unsupported_scorer in unsupported_scorers:
    selected_scorers.remove(unsupported_scorer)


# Score variants in the VCF file.
results = []

for i, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)):
  variant = genome.Variant(
      chromosome=str(vcf_row.CHROM),
      position=int(vcf_row.POS),
      reference_bases=vcf_row.REF,
      alternate_bases=vcf_row.ALT,
      name=vcf_row.variant_id,
  )
  interval = variant.reference_interval.resize(sequence_length)

  variant_scores = dna_model.score_variant(
      interval=interval,
      variant=variant,
      variant_scorers=selected_scorers,
      organism=organism,
  )
  results.append(variant_scores)

df_scores = variant_scorers.tidy_scores(results)

if download_predictions:
  df_scores.to_csv('variant_scores.csv', index=False)
  files.download('variant_scores.csv')

df_scores
Warning: total number of rows (121956) exceeds max_rows (20000). Falling back to pandas display.
variant_id scored_interval gene_id gene_name gene_type gene_strand junction_Start junction_End output_type variant_scorer track_name track_strand Assay title ontology_curie biosample_name biosample_type transcription_factor histone_mark gtex_tissue raw_score quantile_score
0 chr3:58394738:A>T chr3:57870450-58919026:. None None None None None None ATAC CenterMaskScorer(requested_output=ATAC, width=... CL:0000084 ATAC-seq . ATAC-seq CL:0000084 T-cell primary_cell NaN NaN NaN -0.003778 -0.215740
1 chr3:58394738:A>T chr3:57870450-58919026:. None None None None None None ATAC CenterMaskScorer(requested_output=ATAC, width=... CL:0000100 ATAC-seq . ATAC-seq CL:0000100 motor neuron in_vitro_differentiated_cells NaN NaN NaN 0.012303 0.280661
2 chr3:58394738:A>T chr3:57870450-58919026:. None None None None None None ATAC CenterMaskScorer(requested_output=ATAC, width=... CL:0000236 ATAC-seq . ATAC-seq CL:0000236 B cell primary_cell NaN NaN NaN -0.000334 -0.080577
3 chr3:58394738:A>T chr3:57870450-58919026:. None None None None None None ATAC CenterMaskScorer(requested_output=ATAC, width=... CL:0000623 ATAC-seq . ATAC-seq CL:0000623 natural killer cell primary_cell NaN NaN NaN -0.009233 -0.440600
4 chr3:58394738:A>T chr3:57870450-58919026:. None None None None None None ATAC CenterMaskScorer(requested_output=ATAC, width=... CL:0000624 ATAC-seq . ATAC-seq CL:0000624 CD4-positive, alpha-beta T cell primary_cell NaN NaN NaN -0.010510 -0.440600
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
121951 chr16:1135446:G>T chr16:611158-1659734:. ENSG00000292423 ENSG00000292423 lncRNA - None None RNA_SEQ GeneMaskLFCScorer(requested_output=RNA_SEQ) UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra... . polyA plus RNA-seq UBERON:0036149 suprapubic skin tissue NaN NaN Skin_Not_Sun_Exposed_Suprapubic 0.000052 0.069106
121952 chr16:1135446:G>T chr16:611158-1659734:. ENSG00000292430 ENSG00000292430 lncRNA + None None RNA_SEQ GeneMaskLFCScorer(requested_output=RNA_SEQ) UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra... . polyA plus RNA-seq UBERON:0036149 suprapubic skin tissue NaN NaN Skin_Not_Sun_Exposed_Suprapubic 0.004865 0.963724
121953 chr16:1135446:G>T chr16:611158-1659734:. ENSG00000292431 ENSG00000292431 lncRNA - None None RNA_SEQ GeneMaskLFCScorer(requested_output=RNA_SEQ) UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra... . polyA plus RNA-seq UBERON:0036149 suprapubic skin tissue NaN NaN Skin_Not_Sun_Exposed_Suprapubic -0.000141 -0.137554
121954 chr16:1135446:G>T chr16:611158-1659734:. ENSG00000292432 ENSG00000292432 lncRNA - None None RNA_SEQ GeneMaskLFCScorer(requested_output=RNA_SEQ) UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra... . polyA plus RNA-seq UBERON:0036149 suprapubic skin tissue NaN NaN Skin_Not_Sun_Exposed_Suprapubic 0.000508 0.363304
121955 chr16:1135446:G>T chr16:611158-1659734:. ENSG00000292433 ENSG00000292433 lncRNA + None None RNA_SEQ GeneMaskLFCScorer(requested_output=RNA_SEQ) UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra... . polyA plus RNA-seq UBERON:0036149 suprapubic skin tissue NaN NaN Skin_Not_Sun_Exposed_Suprapubic -0.001741 -0.737944

121956 rows × 21 columns

Note that the resulting output dataframe could be quite large, especially if you use many scorers for scoring. Very large dataframes can’t be filtered interactively, but you can interact with them using the standard pandas commands:

# Examine just the effects of the variants on T-cells.
columns = [c for c in df_scores.columns if c != 'ontology_curie']
df_scores[(df_scores['ontology_curie'] == 'CL:0000084')][columns]
variant_id scored_interval gene_id gene_name gene_type gene_strand junction_Start junction_End output_type variant_scorer track_name track_strand Assay title biosample_name biosample_type transcription_factor histone_mark gtex_tissue raw_score quantile_score
0 chr3:58394738:A>T chr3:57870450-58919026:. None None None None None None ATAC CenterMaskScorer(requested_output=ATAC, width=... CL:0000084 ATAC-seq . ATAC-seq T-cell primary_cell NaN NaN NaN -0.003778 -0.215740
168 chr3:58394738:A>T chr3:57870450-58919026:. None None None None None None DNASE CenterMaskScorer(requested_output=DNASE, width... CL:0000084 DNase-seq . DNase-seq T-cell primary_cell NaN NaN NaN -0.030633 -0.680793
2109 chr3:58394738:A>T chr3:57870450-58919026:. None None None None None None CHIP_HISTONE CenterMaskScorer(requested_output=CHIP_HISTONE... CL:0000084 Histone ChIP-seq H3K27ac . Histone ChIP-seq T-cell primary_cell NaN H3K27ac NaN -0.001469 -0.373276
2110 chr3:58394738:A>T chr3:57870450-58919026:. None None None None None None CHIP_HISTONE CenterMaskScorer(requested_output=CHIP_HISTONE... CL:0000084 Histone ChIP-seq H3K27me3 . Histone ChIP-seq T-cell primary_cell NaN H3K27me3 NaN 0.001324 0.171333
2111 chr3:58394738:A>T chr3:57870450-58919026:. None None None None None None CHIP_HISTONE CenterMaskScorer(requested_output=CHIP_HISTONE... CL:0000084 Histone ChIP-seq H3K36me3 . Histone ChIP-seq T-cell primary_cell NaN H3K36me3 NaN -0.002098 -0.383161
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95033 chr16:1135446:G>T chr16:611158-1659734:. ENSG00000290756 ENSG00000290756 lncRNA - None None RNA_SEQ GeneMaskLFCScorer(requested_output=RNA_SEQ) CL:0000084 total RNA-seq - total RNA-seq T-cell primary_cell NaN NaN -0.000237 -0.312224
95034 chr16:1135446:G>T chr16:611158-1659734:. ENSG00000292400 ENSG00000292400 lncRNA - None None RNA_SEQ GeneMaskLFCScorer(requested_output=RNA_SEQ) CL:0000084 total RNA-seq - total RNA-seq T-cell primary_cell NaN NaN 0.000030 0.126219
95035 chr16:1135446:G>T chr16:611158-1659734:. ENSG00000292423 ENSG00000292423 lncRNA - None None RNA_SEQ GeneMaskLFCScorer(requested_output=RNA_SEQ) CL:0000084 total RNA-seq - total RNA-seq T-cell primary_cell NaN NaN 0.000173 0.280661
95036 chr16:1135446:G>T chr16:611158-1659734:. ENSG00000292431 ENSG00000292431 lncRNA - None None RNA_SEQ GeneMaskLFCScorer(requested_output=RNA_SEQ) CL:0000084 total RNA-seq - total RNA-seq T-cell primary_cell NaN NaN -0.000035 -0.126219
95037 chr16:1135446:G>T chr16:611158-1659734:. ENSG00000292432 ENSG00000292432 lncRNA - None None RNA_SEQ GeneMaskLFCScorer(requested_output=RNA_SEQ) CL:0000084 total RNA-seq - total RNA-seq T-cell primary_cell NaN NaN 0.000978 0.635053

524 rows × 20 columns