Source code for gpsea.analysis.predicate._variant

import typing

import hpotk

from gpsea.model import VariantClass, VariantEffect, ProteinMetadata, FeatureType
from gpsea.model.genome import Region
from ._api import VariantPredicate, AllVariantPredicate, AnyVariantPredicate
from ._predicates import (
    AlwaysTrueVariantPredicate,
    ChangeLengthPredicate,
    IsLargeImpreciseStructuralVariantPredicate,
    ProteinFeaturePredicate,
    ProteinFeatureTypePredicate,
    ProteinRegionPredicate,
    RefAlleleLengthPredicate,
    StructuralTypePredicate,
    VariantClassPredicate,
    VariantEffectPredicate,
    VariantExonPredicate,
    VariantGenePredicate,
    VariantKeyPredicate,
    VariantTranscriptPredicate,
)


# We do not need more than just one instance of these predicates.
IS_TRANSLOCATION = VariantClassPredicate(VariantClass.TRANSLOCATION)
IS_LARGE_IMPRECISE_SV = IsLargeImpreciseStructuralVariantPredicate()


[docs] def true() -> VariantPredicate: """ The most inclusive variant predicate - returns `True` for any variant whatsoever. """ return AlwaysTrueVariantPredicate.get_instance()
[docs] def allof(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` that returns `True` if ALL `predicates` evaluate to `True`. This is useful for building compound predicates programmatically. **Example** Build a predicate to test if variant has a functional annotation to genes *SURF1* and *SURF2*: >>> from gpsea.analysis.predicate import allof, gene >>> genes = ('SURF1', 'SURF2',) >>> predicate = allof(gene(g) for g in genes) >>> predicate.description '(affects SURF1 AND affects SURF2)' Args: predicates: an iterable of predicates to test """ predicates = tuple(predicates) if len(predicates) == 1: # No need to wrap one predicate into a logical predicate. return predicates[0] else: return AllVariantPredicate(*predicates)
[docs] def anyof(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` that returns `True` if ANY of the `predicates` evaluates to `True`. This can be useful for building compound predicates programmatically. **Example** Build a predicate to test if variant leads to a missense or nonsense change on a fictional transcript `NM_123456.7`: >>> from gpsea.model import VariantEffect >>> from gpsea.analysis.predicate import anyof, variant_effect >>> tx_id = 'NM_123456.7' >>> effects = (VariantEffect.MISSENSE_VARIANT, VariantEffect.STOP_GAINED,) >>> predicate = anyof(variant_effect(e, tx_id) for e in effects) >>> predicate.description '(MISSENSE_VARIANT on NM_123456.7 OR STOP_GAINED on NM_123456.7)' Args: predicates: an iterable of predicates to test """ predicates = tuple(predicates) if len(predicates) == 1: # No need to wrap one predicate into a logical predicate. return predicates[0] else: return AnyVariantPredicate(*predicates)
[docs] def variant_effect( effect: VariantEffect, tx_id: str, ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` to test if the functional annotation predicts the variant to lead to a certain variant effect. **Example** Make a predicate for testing if the variant leads to a missense change on transcript `NM_123.4`: >>> from gpsea.model import VariantEffect >>> from gpsea.analysis.predicate import variant_effect >>> predicate = variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id='NM_123.4') >>> predicate.description 'MISSENSE_VARIANT on NM_123.4' Args: effect: the target :class:`~gpsea.model.VariantEffect` tx_id: a `str` with the accession ID of the target transcript (e.g. `NM_123.4`) """ return VariantEffectPredicate(effect, tx_id)
[docs] def variant_key(key: str) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` that tests if the variant matches the provided `key`. Args: key: a `str` with the variant key (e.g. `X_12345_12345_C_G` or `22_10001_20000_INV`) """ return VariantKeyPredicate(key)
[docs] def gene(symbol: str) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` that tests if the variant affects a given gene. Args: symbol: a `str` with the gene symbol (e.g. ``'FBN1'``). """ return VariantGenePredicate(symbol)
[docs] def transcript(tx_id: str) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` that tests if the variant affects a transcript. Args: tx_id: a `str` with the accession ID of the target transcript (e.g. `NM_123.4`) """ return VariantTranscriptPredicate(tx_id)
[docs] def exon( exon: int, tx_id: str, ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` that tests if the variant overlaps with an exon of a specific transcript. .. warning:: We use 1-based numbering to number the exons, not the usual 0-based numbering of the computer science. Therefore, the first exon of the transcript has ``exon_number==1``, the second exon is ``2``, and so on ... .. warning:: We do not check if the `exon_number` spans beyond the number of exons of the given `transcript_id`! Therefore, ``exon_number==10,000`` will effectively return `False` for *all* variants!!! 😱 Well, at least the genome variants of the *Homo sapiens sapiens* taxon... Args: exon: a positive `int` with the index of the target exon (e.g. `1` for the 1st exon, `2` for the 2nd, ...) tx_id: a `str` with the accession ID of the target transcript (e.g. `NM_123.4`) """ return VariantExonPredicate(exon, tx_id)
[docs] def protein_region( region: typing.Union[typing.Tuple[int, int], Region], tx_id: str, ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` that tests if the variant overlaps with a region on a protein of a specific transcript. Example ------- Create a predicate to test if the variant overlaps with the 5th aminoacid of the protein encoded by a fictional transcript `NM_1234.5`: >>> from gpsea.analysis.predicate import protein_region >>> overlaps_with_fifth_aa = protein_region(region=(5, 5), tx_id="NM_1234.5") >>> overlaps_with_fifth_aa.description 'overlaps with [5,5] region of the protein encoded by NM_1234.5' Create a predicate to test if the variant overlaps with the first 20 aminoacid residues of the same transcript: >>> overlaps_with_first_20 = protein_region(region=(1, 20), tx_id="NM_1234.5") >>> overlaps_with_first_20.description 'overlaps with [1,20] region of the protein encoded by NM_1234.5' Args: region: a :class:`~gpsea.model.genome.Region` that gives the start and end coordinate of the region of interest on a protein strand or a tuple with 1-based coordinates. """ if isinstance(region, Region): pass elif ( isinstance(region, tuple) and len(region) == 2 and all(isinstance(r, int) and r > 0 for r in region) ): start = region[0] - 1 # Convert to 0-based end = region[1] region = Region(start=start, end=end) else: raise ValueError( f"region must be a `Region` or a tuple with two positive `int`s, but got {region}" ) return ProteinRegionPredicate(region, tx_id)
[docs] def is_large_imprecise_sv() -> VariantPredicate: """ Prepare a :class:`VariantPredicate` for testing if the variant is a large structural variant (SV) without exact breakpoint coordinates. """ return IS_LARGE_IMPRECISE_SV
[docs] def is_structural_variant( threshold: int = 50, ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` for testing if the variant is a structural variant (SV). SVs are usually defined as variant affecting more than a certain number of base pairs. The thresholds vary in the literature, but here we use 50bp as a default. Any variant that affects at least `threshold` base pairs is considered an SV. Large SVs with unknown breakpoint coordinates or translocations (:class:`~gpsea.model.VariantClass.TRANSLOCATION`) are always considered as an SV. Args: threshold: a non-negative `int` with the number of base pairs that must be affected """ assert threshold >= 0, "`threshold` must be non-negative!" return ( change_length("<=", -threshold) | change_length(">=", threshold) | is_large_imprecise_sv() | IS_TRANSLOCATION )
[docs] def structural_type( curie: typing.Union[str, hpotk.TermId], ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` for testing if the variant has a certain structural type. We recommend using a descendant of `structural_variant` (`SO:0001537 <https://purl.obolibrary.org/obo/SO_0001537>`_) as the structural type. **Example** Make a predicate for testing if the variant is a chromosomal deletion (`SO:1000029`): >>> from gpsea.analysis.predicate import structural_type >>> predicate = structural_type('SO:1000029') >>> predicate.description 'structural type is SO:1000029' Args: curie: compact uniform resource identifier (CURIE) with the structural type to test. """ return StructuralTypePredicate.from_curie(curie)
[docs] def variant_class( variant_class: VariantClass, ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` for testing if the variant is of a certain :class:`~gpsea.model.VariantClass`. **Example** Make a predicate to test if the variant is a deletion: >>> from gpsea.model import VariantClass >>> from gpsea.analysis.predicate import variant_class >>> predicate = variant_class(VariantClass.DEL) >>> predicate.description 'variant class is DEL' Args: variant_class: the variant class to test. """ return VariantClassPredicate( query=variant_class, )
[docs] def ref_length( operator: typing.Literal["<", "<=", "==", "!=", ">=", ">"], length: int, ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` for testing if the reference (REF) allele of variant is above, below, or (not) equal to certain `length`. .. seealso:: See :ref:`length-of-the-reference-allele` for more info. **Example** Prepare a predicate that tests that the REF allele includes more than 5 base pairs: >>> from gpsea.analysis.predicate import ref_length >>> predicate = ref_length('>', 5) >>> predicate.description 'reference allele length > 5' Args: operator: a `str` with the desired test. Must be one of ``{ '<', '<=', '==', '!=', '>=', '>' }``. length: a non-negative `int` with the length threshold. """ return RefAlleleLengthPredicate(operator, length)
[docs] def change_length( operator: typing.Literal["<", "<=", "==", "!=", ">=", ">"], threshold: int, ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` for testing if the variant's change length is above, below, or (not) equal to certain `threshold`. .. seealso:: See :ref:`change-length-of-an-allele` for more info. **Example** Make a predicate for testing if the change length is less than or equal to `-10`, e.g. to test if a variant is a *deletion* leading to removal of at least 10 base pairs: >>> from gpsea.analysis.predicate import change_length >>> predicate = change_length('<=', -10) >>> predicate.description 'change length <= -10' Args: operator: a `str` with the desired test. Must be one of ``{ '<', '<=', '==', '!=', '>=', '>' }``. threshold: an `int` with the threshold. Can be negative, zero, or positive. """ return ChangeLengthPredicate(operator, threshold)
[docs] def is_structural_deletion( threshold: int = -50, ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` for testing if the variant is a `chromosomal deletion <https://purl.obolibrary.org/obo/SO_1000029>`_ or a structural variant deletion that leads to removal of at least *n* base pairs (50bp by default). .. note:: The predicate uses :meth:`~gpsea.model.VariantCoordinates.change_length` to determine if the length of the variant is above or below `threshold`. **IMPORTANT**: the change lengths of deletions are *negative*, since the alternate allele is shorter than the reference allele. See :ref:`change-length-of-an-allele` for more info. **Example** Prepare a predicate for testing if the variant is a chromosomal deletion that removes at least 20 base pairs: >>> from gpsea.analysis.predicate import is_structural_deletion >>> predicate = is_structural_deletion(-20) >>> predicate.description '(structural type is SO:1000029 OR (variant class is DEL AND change length <= -20))' Args: threshold: an `int` with the change length threshold to determine if a variant is "structural" (-50 bp by default). """ chromosomal_deletion = "SO:1000029" return structural_type(chromosomal_deletion) | ( variant_class(VariantClass.DEL) & change_length("<=", threshold) )
[docs] def protein_feature_type( feature_type: typing.Union[FeatureType, str], protein_metadata: ProteinMetadata, ) -> VariantPredicate: """ Prepare a :class:`~gpsea.analysis.predicate.VariantPredicate` to test if the variant affects a `feature_type` of a protein. Args: feature_type: the target protein :class:`~gpsea.model.FeatureType` (e.g. :class:`~gpsea.model.FeatureType.DOMAIN`). protein_metadata: the information about the protein. """ if isinstance(feature_type, FeatureType): FeatureType.deprecation_warning() feature_type = feature_type.name return ProteinFeatureTypePredicate( feature_type=feature_type, protein_metadata=protein_metadata, )
[docs] def protein_feature( feature_id: str, protein_metadata: ProteinMetadata, ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` to test if the variant affects a protein feature labeled with the provided `feature_id`. Args: feature_id: the id of the target protein feature (e.g. `ANK 1`) protein_metadata: the information about the protein. """ return ProteinFeaturePredicate( feature_id=feature_id, protein_metadata=protein_metadata, )