Batch variant scoring

Batch variant scoring#

In this notebook, we demonstrate how to score many variants using AlphaGenome.

To prepare numerous variants for scoring, provide the following information as columns in a tab-separated Variant Call Format (VCF) file:

variant_id: A unique identifier for each variant.
CHROM: Chromosome, specified as a string beginning with ‘chr’ (e.g., ‘chr1’).
POS: Integer representing the base pair position (1-based; build hg38 (human) or mm10 (mouse) - see FAQ).
REF: The reference nucleotide sequence at the specified position.
ALT: The alternate nucleotide sequence at the specified position.

Tip

Open this tutorial in Google colab for interactive viewing.

# @title Install AlphaGenome

# @markdown Run this cell to install AlphaGenome.
from IPython.display import clear_output
! pip install alphagenome
clear_output()

# @title Setup and imports.

from io import StringIO
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
from google.colab import data_table, files
from google.colab import userdata
import pandas as pd
from tqdm import tqdm

data_table.enable_dataframe_formatter()

# Load the model.
dna_model = dna_client.create(userdata.get('ALPHA_GENOME_API_KEY'))

# @title Score batch of variants.

# Load VCF file containing variants.
vcf_file = 'placeholder'  # @param

# We provide an example list of variants to illustrate.
vcf_file = """variant_id\tCHROM\tPOS\tREF\tALT
chr3_58394738_A_T_b38\tchr3\t58394738\tA\tT
chr8_28520_G_C_b38\tchr8\t28520\tG\tC
chr16_636337_G_A_b38\tchr16\t636337\tG\tA
chr16_1135446_G_T_b38\tchr16\t1135446\tG\tT
"""

vcf = pd.read_csv(StringIO(vcf_file), sep='\t')

required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT']
for column in required_columns:
  if column not in vcf.columns:
    raise ValueError(f'VCF file is missing required column: {column}.')

organism = 'human'  # @param ["human", "mouse"] {type:"string"}

# @markdown Specify length of sequence around variants to predict:
sequence_length = '1MB'  # @param ["2KB", "16KB", "100KB", "500KB", "1MB"] { type:"string" }
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
    f'SEQUENCE_LENGTH_{sequence_length}'
]

# @markdown Specify which scorers to use to score your variants:
score_rna_seq = True  # @param { type: "boolean"}
score_cage = True  # @param { type: "boolean" }
score_procap = True  # @param { type: "boolean" }
score_atac = True  # @param { type: "boolean" }
score_dnase = True  # @param { type: "boolean" }
score_chip_histone = True  # @param { type: "boolean" }
score_chip_tf = True  # @param { type: "boolean" }
score_polyadenylation = True  # @param { type: "boolean" }
score_splice_sites = True  # @param { type: "boolean" }
score_splice_site_usage = True  # @param { type: "boolean" }
score_splice_junctions = True  # @param { type: "boolean" }

# @markdown Other settings:
download_predictions = False  # @param { type: "boolean" }

# Parse organism specification.
organism_map = {
    'human': dna_client.Organism.HOMO_SAPIENS,
    'mouse': dna_client.Organism.MUS_MUSCULUS,
}
organism = organism_map[organism]

# Parse scorer specification.
scorer_selections = {
    'rna_seq': score_rna_seq,
    'cage': score_cage,
    'procap': score_procap,
    'atac': score_atac,
    'dnase': score_dnase,
    'chip_histone': score_chip_histone,
    'chip_tf': score_chip_tf,
    'polyadenylation': score_polyadenylation,
    'splice_sites': score_splice_sites,
    'splice_site_usage': score_splice_site_usage,
    'splice_junctions': score_splice_junctions,
}

all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
selected_scorers = [
    all_scorers[key]
    for key in all_scorers
    if scorer_selections.get(key.lower(), False)
]

# Remove any scorers or output types that are not supported for the chosen organism.
unsupported_scorers = [
    scorer
    for scorer in selected_scorers
    if (
        organism.value
        not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
    )
    | (
        (scorer.requested_output == dna_client.OutputType.PROCAP)
        & (organism == dna_client.Organism.MUS_MUSCULUS)
    )
]
if len(unsupported_scorers) > 0:
  print(
      f'Excluding {unsupported_scorers} scorers as they are not supported for'
      f' {organism}.'
  )
  for unsupported_scorer in unsupported_scorers:
    selected_scorers.remove(unsupported_scorer)


# Score variants in the VCF file.
results = []

for i, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)):
  variant = genome.Variant(
      chromosome=str(vcf_row.CHROM),
      position=int(vcf_row.POS),
      reference_bases=vcf_row.REF,
      alternate_bases=vcf_row.ALT,
      name=vcf_row.variant_id,
  )
  interval = variant.reference_interval.resize(sequence_length)

  variant_scores = dna_model.score_variant(
      interval=interval,
      variant=variant,
      variant_scorers=selected_scorers,
      organism=organism,
  )
  results.append(variant_scores)

df_scores = variant_scorers.tidy_scores(results)

if download_predictions:
  df_scores.to_csv('variant_scores.csv', index=False)
  files.download('variant_scores.csv')

df_scores

Warning: total number of rows (121956) exceeds max_rows (20000). Falling back to pandas display.

	variant_id	scored_interval	gene_id	gene_name	gene_type	gene_strand	junction_Start	junction_End	output_type	variant_scorer	track_name	track_strand	Assay title	ontology_curie	biosample_name	biosample_type	transcription_factor	histone_mark	gtex_tissue	raw_score	quantile_score
0	chr3:58394738:A>T	chr3:57870450-58919026:.	None	None	None	None	None	None	ATAC	CenterMaskScorer(requested_output=ATAC, width=...	CL:0000084 ATAC-seq	.	ATAC-seq	CL:0000084	T-cell	primary_cell	NaN	NaN	NaN	-0.003778	-0.215740
1	chr3:58394738:A>T	chr3:57870450-58919026:.	None	None	None	None	None	None	ATAC	CenterMaskScorer(requested_output=ATAC, width=...	CL:0000100 ATAC-seq	.	ATAC-seq	CL:0000100	motor neuron	in_vitro_differentiated_cells	NaN	NaN	NaN	0.012303	0.280661
2	chr3:58394738:A>T	chr3:57870450-58919026:.	None	None	None	None	None	None	ATAC	CenterMaskScorer(requested_output=ATAC, width=...	CL:0000236 ATAC-seq	.	ATAC-seq	CL:0000236	B cell	primary_cell	NaN	NaN	NaN	-0.000334	-0.080577
3	chr3:58394738:A>T	chr3:57870450-58919026:.	None	None	None	None	None	None	ATAC	CenterMaskScorer(requested_output=ATAC, width=...	CL:0000623 ATAC-seq	.	ATAC-seq	CL:0000623	natural killer cell	primary_cell	NaN	NaN	NaN	-0.009233	-0.440600
4	chr3:58394738:A>T	chr3:57870450-58919026:.	None	None	None	None	None	None	ATAC	CenterMaskScorer(requested_output=ATAC, width=...	CL:0000624 ATAC-seq	.	ATAC-seq	CL:0000624	CD4-positive, alpha-beta T cell	primary_cell	NaN	NaN	NaN	-0.010510	-0.440600
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
121951	chr16:1135446:G>T	chr16:611158-1659734:.	ENSG00000292423	ENSG00000292423	lncRNA	-	None	None	RNA_SEQ	GeneMaskLFCScorer(requested_output=RNA_SEQ)	UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra...	.	polyA plus RNA-seq	UBERON:0036149	suprapubic skin	tissue	NaN	NaN	Skin_Not_Sun_Exposed_Suprapubic	0.000052	0.069106
121952	chr16:1135446:G>T	chr16:611158-1659734:.	ENSG00000292430	ENSG00000292430	lncRNA	+	None	None	RNA_SEQ	GeneMaskLFCScorer(requested_output=RNA_SEQ)	UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra...	.	polyA plus RNA-seq	UBERON:0036149	suprapubic skin	tissue	NaN	NaN	Skin_Not_Sun_Exposed_Suprapubic	0.004865	0.963724
121953	chr16:1135446:G>T	chr16:611158-1659734:.	ENSG00000292431	ENSG00000292431	lncRNA	-	None	None	RNA_SEQ	GeneMaskLFCScorer(requested_output=RNA_SEQ)	UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra...	.	polyA plus RNA-seq	UBERON:0036149	suprapubic skin	tissue	NaN	NaN	Skin_Not_Sun_Exposed_Suprapubic	-0.000141	-0.137554
121954	chr16:1135446:G>T	chr16:611158-1659734:.	ENSG00000292432	ENSG00000292432	lncRNA	-	None	None	RNA_SEQ	GeneMaskLFCScorer(requested_output=RNA_SEQ)	UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra...	.	polyA plus RNA-seq	UBERON:0036149	suprapubic skin	tissue	NaN	NaN	Skin_Not_Sun_Exposed_Suprapubic	0.000508	0.363304
121955	chr16:1135446:G>T	chr16:611158-1659734:.	ENSG00000292433	ENSG00000292433	lncRNA	+	None	None	RNA_SEQ	GeneMaskLFCScorer(requested_output=RNA_SEQ)	UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra...	.	polyA plus RNA-seq	UBERON:0036149	suprapubic skin	tissue	NaN	NaN	Skin_Not_Sun_Exposed_Suprapubic	-0.001741	-0.737944

121956 rows × 21 columns

Note that the resulting output dataframe could be quite large, especially if you use many scorers for scoring. Very large dataframes can’t be filtered interactively, but you can interact with them using the standard pandas commands:

# Examine just the effects of the variants on T-cells.
columns = [c for c in df_scores.columns if c != 'ontology_curie']
df_scores[(df_scores['ontology_curie'] == 'CL:0000084')][columns]

	variant_id	scored_interval	gene_id	gene_name	gene_type	gene_strand	junction_Start	junction_End	output_type	variant_scorer	track_name	track_strand	Assay title	biosample_name	biosample_type	transcription_factor	histone_mark	gtex_tissue	raw_score	quantile_score
0	chr3:58394738:A>T	chr3:57870450-58919026:.	None	None	None	None	None	None	ATAC	CenterMaskScorer(requested_output=ATAC, width=...	CL:0000084 ATAC-seq	.	ATAC-seq	T-cell	primary_cell	NaN	NaN	NaN	-0.003778	-0.215740
168	chr3:58394738:A>T	chr3:57870450-58919026:.	None	None	None	None	None	None	DNASE	CenterMaskScorer(requested_output=DNASE, width...	CL:0000084 DNase-seq	.	DNase-seq	T-cell	primary_cell	NaN	NaN	NaN	-0.030633	-0.680793
2109	chr3:58394738:A>T	chr3:57870450-58919026:.	None	None	None	None	None	None	CHIP_HISTONE	CenterMaskScorer(requested_output=CHIP_HISTONE...	CL:0000084 Histone ChIP-seq H3K27ac	.	Histone ChIP-seq	T-cell	primary_cell	NaN	H3K27ac	NaN	-0.001469	-0.373276
2110	chr3:58394738:A>T	chr3:57870450-58919026:.	None	None	None	None	None	None	CHIP_HISTONE	CenterMaskScorer(requested_output=CHIP_HISTONE...	CL:0000084 Histone ChIP-seq H3K27me3	.	Histone ChIP-seq	T-cell	primary_cell	NaN	H3K27me3	NaN	0.001324	0.171333
2111	chr3:58394738:A>T	chr3:57870450-58919026:.	None	None	None	None	None	None	CHIP_HISTONE	CenterMaskScorer(requested_output=CHIP_HISTONE...	CL:0000084 Histone ChIP-seq H3K36me3	.	Histone ChIP-seq	T-cell	primary_cell	NaN	H3K36me3	NaN	-0.002098	-0.383161
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
95033	chr16:1135446:G>T	chr16:611158-1659734:.	ENSG00000290756	ENSG00000290756	lncRNA	-	None	None	RNA_SEQ	GeneMaskLFCScorer(requested_output=RNA_SEQ)	CL:0000084 total RNA-seq	-	total RNA-seq	T-cell	primary_cell	NaN	NaN		-0.000237	-0.312224
95034	chr16:1135446:G>T	chr16:611158-1659734:.	ENSG00000292400	ENSG00000292400	lncRNA	-	None	None	RNA_SEQ	GeneMaskLFCScorer(requested_output=RNA_SEQ)	CL:0000084 total RNA-seq	-	total RNA-seq	T-cell	primary_cell	NaN	NaN		0.000030	0.126219
95035	chr16:1135446:G>T	chr16:611158-1659734:.	ENSG00000292423	ENSG00000292423	lncRNA	-	None	None	RNA_SEQ	GeneMaskLFCScorer(requested_output=RNA_SEQ)	CL:0000084 total RNA-seq	-	total RNA-seq	T-cell	primary_cell	NaN	NaN		0.000173	0.280661
95036	chr16:1135446:G>T	chr16:611158-1659734:.	ENSG00000292431	ENSG00000292431	lncRNA	-	None	None	RNA_SEQ	GeneMaskLFCScorer(requested_output=RNA_SEQ)	CL:0000084 total RNA-seq	-	total RNA-seq	T-cell	primary_cell	NaN	NaN		-0.000035	-0.126219
95037	chr16:1135446:G>T	chr16:611158-1659734:.	ENSG00000292432	ENSG00000292432	lncRNA	-	None	None	RNA_SEQ	GeneMaskLFCScorer(requested_output=RNA_SEQ)	CL:0000084 total RNA-seq	-	total RNA-seq	T-cell	primary_cell	NaN	NaN		0.000978	0.635053

524 rows × 20 columns