Batch variant scoring#
In this notebook, we demonstrate how to score many variants using AlphaGenome.
To prepare numerous variants for scoring, provide the following information as columns in a tab-separated Variant Call Format (VCF) file:
variant_id
: A unique identifier for each variant.CHROM
: Chromosome, specified as a string beginning with ‘chr’ (e.g., ‘chr1’).POS
: Integer representing the base pair position (1-based; build hg38 (human) or mm10 (mouse) - see FAQ).REF
: The reference nucleotide sequence at the specified position.ALT
: The alternate nucleotide sequence at the specified position.
Tip
Open this tutorial in Google colab for interactive viewing.
# @title Install AlphaGenome
# @markdown Run this cell to install AlphaGenome.
from IPython.display import clear_output
! pip install alphagenome
clear_output()
# @title Setup and imports.
from io import StringIO
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
from google.colab import data_table, files
from google.colab import userdata
import pandas as pd
from tqdm import tqdm
data_table.enable_dataframe_formatter()
# Load the model.
dna_model = dna_client.create(userdata.get('ALPHA_GENOME_API_KEY'))
# @title Score batch of variants.
# Load VCF file containing variants.
vcf_file = 'placeholder' # @param
# We provide an example list of variants to illustrate.
vcf_file = """variant_id\tCHROM\tPOS\tREF\tALT
chr3_58394738_A_T_b38\tchr3\t58394738\tA\tT
chr8_28520_G_C_b38\tchr8\t28520\tG\tC
chr16_636337_G_A_b38\tchr16\t636337\tG\tA
chr16_1135446_G_T_b38\tchr16\t1135446\tG\tT
"""
vcf = pd.read_csv(StringIO(vcf_file), sep='\t')
required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT']
for column in required_columns:
if column not in vcf.columns:
raise ValueError(f'VCF file is missing required column: {column}.')
organism = 'human' # @param ["human", "mouse"] {type:"string"}
# @markdown Specify length of sequence around variants to predict:
sequence_length = '1MB' # @param ["2KB", "16KB", "100KB", "500KB", "1MB"] { type:"string" }
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
f'SEQUENCE_LENGTH_{sequence_length}'
]
# @markdown Specify which scorers to use to score your variants:
score_rna_seq = True # @param { type: "boolean"}
score_cage = True # @param { type: "boolean" }
score_procap = True # @param { type: "boolean" }
score_atac = True # @param { type: "boolean" }
score_dnase = True # @param { type: "boolean" }
score_chip_histone = True # @param { type: "boolean" }
score_chip_tf = True # @param { type: "boolean" }
score_polyadenylation = True # @param { type: "boolean" }
score_splice_sites = True # @param { type: "boolean" }
score_splice_site_usage = True # @param { type: "boolean" }
score_splice_junctions = True # @param { type: "boolean" }
# @markdown Other settings:
download_predictions = False # @param { type: "boolean" }
# Parse organism specification.
organism_map = {
'human': dna_client.Organism.HOMO_SAPIENS,
'mouse': dna_client.Organism.MUS_MUSCULUS,
}
organism = organism_map[organism]
# Parse scorer specification.
scorer_selections = {
'rna_seq': score_rna_seq,
'cage': score_cage,
'procap': score_procap,
'atac': score_atac,
'dnase': score_dnase,
'chip_histone': score_chip_histone,
'chip_tf': score_chip_tf,
'polyadenylation': score_polyadenylation,
'splice_sites': score_splice_sites,
'splice_site_usage': score_splice_site_usage,
'splice_junctions': score_splice_junctions,
}
all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
selected_scorers = [
all_scorers[key]
for key in all_scorers
if scorer_selections.get(key.lower(), False)
]
# Remove any scorers or output types that are not supported for the chosen organism.
unsupported_scorers = [
scorer
for scorer in selected_scorers
if (
organism.value
not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
)
| (
(scorer.requested_output == dna_client.OutputType.PROCAP)
& (organism == dna_client.Organism.MUS_MUSCULUS)
)
]
if len(unsupported_scorers) > 0:
print(
f'Excluding {unsupported_scorers} scorers as they are not supported for'
f' {organism}.'
)
for unsupported_scorer in unsupported_scorers:
selected_scorers.remove(unsupported_scorer)
# Score variants in the VCF file.
results = []
for i, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)):
variant = genome.Variant(
chromosome=str(vcf_row.CHROM),
position=int(vcf_row.POS),
reference_bases=vcf_row.REF,
alternate_bases=vcf_row.ALT,
name=vcf_row.variant_id,
)
interval = variant.reference_interval.resize(sequence_length)
variant_scores = dna_model.score_variant(
interval=interval,
variant=variant,
variant_scorers=selected_scorers,
organism=organism,
)
results.append(variant_scores)
df_scores = variant_scorers.tidy_scores(results)
if download_predictions:
df_scores.to_csv('variant_scores.csv', index=False)
files.download('variant_scores.csv')
df_scores
Warning: total number of rows (121956) exceeds max_rows (20000). Falling back to pandas display.
variant_id | scored_interval | gene_id | gene_name | gene_type | gene_strand | junction_Start | junction_End | output_type | variant_scorer | track_name | track_strand | Assay title | ontology_curie | biosample_name | biosample_type | transcription_factor | histone_mark | gtex_tissue | raw_score | quantile_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr3:58394738:A>T | chr3:57870450-58919026:. | None | None | None | None | None | None | ATAC | CenterMaskScorer(requested_output=ATAC, width=... | CL:0000084 ATAC-seq | . | ATAC-seq | CL:0000084 | T-cell | primary_cell | NaN | NaN | NaN | -0.003778 | -0.215740 |
1 | chr3:58394738:A>T | chr3:57870450-58919026:. | None | None | None | None | None | None | ATAC | CenterMaskScorer(requested_output=ATAC, width=... | CL:0000100 ATAC-seq | . | ATAC-seq | CL:0000100 | motor neuron | in_vitro_differentiated_cells | NaN | NaN | NaN | 0.012303 | 0.280661 |
2 | chr3:58394738:A>T | chr3:57870450-58919026:. | None | None | None | None | None | None | ATAC | CenterMaskScorer(requested_output=ATAC, width=... | CL:0000236 ATAC-seq | . | ATAC-seq | CL:0000236 | B cell | primary_cell | NaN | NaN | NaN | -0.000334 | -0.080577 |
3 | chr3:58394738:A>T | chr3:57870450-58919026:. | None | None | None | None | None | None | ATAC | CenterMaskScorer(requested_output=ATAC, width=... | CL:0000623 ATAC-seq | . | ATAC-seq | CL:0000623 | natural killer cell | primary_cell | NaN | NaN | NaN | -0.009233 | -0.440600 |
4 | chr3:58394738:A>T | chr3:57870450-58919026:. | None | None | None | None | None | None | ATAC | CenterMaskScorer(requested_output=ATAC, width=... | CL:0000624 ATAC-seq | . | ATAC-seq | CL:0000624 | CD4-positive, alpha-beta T cell | primary_cell | NaN | NaN | NaN | -0.010510 | -0.440600 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
121951 | chr16:1135446:G>T | chr16:611158-1659734:. | ENSG00000292423 | ENSG00000292423 | lncRNA | - | None | None | RNA_SEQ | GeneMaskLFCScorer(requested_output=RNA_SEQ) | UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra... | . | polyA plus RNA-seq | UBERON:0036149 | suprapubic skin | tissue | NaN | NaN | Skin_Not_Sun_Exposed_Suprapubic | 0.000052 | 0.069106 |
121952 | chr16:1135446:G>T | chr16:611158-1659734:. | ENSG00000292430 | ENSG00000292430 | lncRNA | + | None | None | RNA_SEQ | GeneMaskLFCScorer(requested_output=RNA_SEQ) | UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra... | . | polyA plus RNA-seq | UBERON:0036149 | suprapubic skin | tissue | NaN | NaN | Skin_Not_Sun_Exposed_Suprapubic | 0.004865 | 0.963724 |
121953 | chr16:1135446:G>T | chr16:611158-1659734:. | ENSG00000292431 | ENSG00000292431 | lncRNA | - | None | None | RNA_SEQ | GeneMaskLFCScorer(requested_output=RNA_SEQ) | UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra... | . | polyA plus RNA-seq | UBERON:0036149 | suprapubic skin | tissue | NaN | NaN | Skin_Not_Sun_Exposed_Suprapubic | -0.000141 | -0.137554 |
121954 | chr16:1135446:G>T | chr16:611158-1659734:. | ENSG00000292432 | ENSG00000292432 | lncRNA | - | None | None | RNA_SEQ | GeneMaskLFCScorer(requested_output=RNA_SEQ) | UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra... | . | polyA plus RNA-seq | UBERON:0036149 | suprapubic skin | tissue | NaN | NaN | Skin_Not_Sun_Exposed_Suprapubic | 0.000508 | 0.363304 |
121955 | chr16:1135446:G>T | chr16:611158-1659734:. | ENSG00000292433 | ENSG00000292433 | lncRNA | + | None | None | RNA_SEQ | GeneMaskLFCScorer(requested_output=RNA_SEQ) | UBERON:0036149 gtex Skin_Not_Sun_Exposed_Supra... | . | polyA plus RNA-seq | UBERON:0036149 | suprapubic skin | tissue | NaN | NaN | Skin_Not_Sun_Exposed_Suprapubic | -0.001741 | -0.737944 |
121956 rows × 21 columns
Note that the resulting output dataframe could be quite large, especially if you use many scorers for scoring. Very large dataframes can’t be filtered interactively, but you can interact with them using the standard pandas
commands:
# Examine just the effects of the variants on T-cells.
columns = [c for c in df_scores.columns if c != 'ontology_curie']
df_scores[(df_scores['ontology_curie'] == 'CL:0000084')][columns]
variant_id | scored_interval | gene_id | gene_name | gene_type | gene_strand | junction_Start | junction_End | output_type | variant_scorer | track_name | track_strand | Assay title | biosample_name | biosample_type | transcription_factor | histone_mark | gtex_tissue | raw_score | quantile_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr3:58394738:A>T | chr3:57870450-58919026:. | None | None | None | None | None | None | ATAC | CenterMaskScorer(requested_output=ATAC, width=... | CL:0000084 ATAC-seq | . | ATAC-seq | T-cell | primary_cell | NaN | NaN | NaN | -0.003778 | -0.215740 |
168 | chr3:58394738:A>T | chr3:57870450-58919026:. | None | None | None | None | None | None | DNASE | CenterMaskScorer(requested_output=DNASE, width... | CL:0000084 DNase-seq | . | DNase-seq | T-cell | primary_cell | NaN | NaN | NaN | -0.030633 | -0.680793 |
2109 | chr3:58394738:A>T | chr3:57870450-58919026:. | None | None | None | None | None | None | CHIP_HISTONE | CenterMaskScorer(requested_output=CHIP_HISTONE... | CL:0000084 Histone ChIP-seq H3K27ac | . | Histone ChIP-seq | T-cell | primary_cell | NaN | H3K27ac | NaN | -0.001469 | -0.373276 |
2110 | chr3:58394738:A>T | chr3:57870450-58919026:. | None | None | None | None | None | None | CHIP_HISTONE | CenterMaskScorer(requested_output=CHIP_HISTONE... | CL:0000084 Histone ChIP-seq H3K27me3 | . | Histone ChIP-seq | T-cell | primary_cell | NaN | H3K27me3 | NaN | 0.001324 | 0.171333 |
2111 | chr3:58394738:A>T | chr3:57870450-58919026:. | None | None | None | None | None | None | CHIP_HISTONE | CenterMaskScorer(requested_output=CHIP_HISTONE... | CL:0000084 Histone ChIP-seq H3K36me3 | . | Histone ChIP-seq | T-cell | primary_cell | NaN | H3K36me3 | NaN | -0.002098 | -0.383161 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
95033 | chr16:1135446:G>T | chr16:611158-1659734:. | ENSG00000290756 | ENSG00000290756 | lncRNA | - | None | None | RNA_SEQ | GeneMaskLFCScorer(requested_output=RNA_SEQ) | CL:0000084 total RNA-seq | - | total RNA-seq | T-cell | primary_cell | NaN | NaN | -0.000237 | -0.312224 | |
95034 | chr16:1135446:G>T | chr16:611158-1659734:. | ENSG00000292400 | ENSG00000292400 | lncRNA | - | None | None | RNA_SEQ | GeneMaskLFCScorer(requested_output=RNA_SEQ) | CL:0000084 total RNA-seq | - | total RNA-seq | T-cell | primary_cell | NaN | NaN | 0.000030 | 0.126219 | |
95035 | chr16:1135446:G>T | chr16:611158-1659734:. | ENSG00000292423 | ENSG00000292423 | lncRNA | - | None | None | RNA_SEQ | GeneMaskLFCScorer(requested_output=RNA_SEQ) | CL:0000084 total RNA-seq | - | total RNA-seq | T-cell | primary_cell | NaN | NaN | 0.000173 | 0.280661 | |
95036 | chr16:1135446:G>T | chr16:611158-1659734:. | ENSG00000292431 | ENSG00000292431 | lncRNA | - | None | None | RNA_SEQ | GeneMaskLFCScorer(requested_output=RNA_SEQ) | CL:0000084 total RNA-seq | - | total RNA-seq | T-cell | primary_cell | NaN | NaN | -0.000035 | -0.126219 | |
95037 | chr16:1135446:G>T | chr16:611158-1659734:. | ENSG00000292432 | ENSG00000292432 | lncRNA | - | None | None | RNA_SEQ | GeneMaskLFCScorer(requested_output=RNA_SEQ) | CL:0000084 total RNA-seq | - | total RNA-seq | T-cell | primary_cell | NaN | NaN | 0.000978 | 0.635053 |
524 rows × 20 columns