Source code for gpsea.model._variant

import abc
import enum
import typing

import hpotk

from .genome import Region, GenomicRegion, Contig, Strand
from ._gt import Genotyped, Genotypes
from ._variant_effects import VariantEffect


[docs] class TranscriptInfoAware(metaclass=abc.ABCMeta): """ The implementors know about basic gene/transcript identifiers. """ @property @abc.abstractmethod def gene_id(self) -> str: """ Returns: string: The gene symbol (e.g. SURF1) """ pass @property @abc.abstractmethod def transcript_id(self) -> str: """ Returns: string: The transcript RefSeq identifier (e.g. NM_123456.7) """ pass
[docs] class TranscriptAnnotation(TranscriptInfoAware): """ `TranscriptAnnotation` represent a result of the functional annotation of a variant with respect to single transcript of a gene. """ def __init__( self, gene_id: str, tx_id: str, hgvs_cdna: typing.Optional[str], is_preferred: bool, variant_effects: typing.Iterable[VariantEffect], affected_exons: typing.Optional[typing.Iterable[int]], protein_id: typing.Optional[str], hgvsp: typing.Optional[str], protein_effect_coordinates: typing.Optional[Region], ): self._gene_id = hpotk.util.validate_instance(gene_id, str, "gene_id") self._tx_id = hpotk.util.validate_instance(tx_id, str, "tx_id") self._hgvs_cdna = hpotk.util.validate_optional_instance( hgvs_cdna, str, "hgvs_cdna" ) self._is_preferred = hpotk.util.validate_instance( is_preferred, bool, "is_preferred" ) self._variant_effects = tuple(variant_effects) if affected_exons is not None: self._affected_exons = tuple(affected_exons) else: self._affected_exons = None if protein_id is not None: self._protein_id = hpotk.util.validate_instance( protein_id, str, "protein_id" ) else: self._protein_id = None if hgvsp is not None: self._hgvsp = hpotk.util.validate_instance(hgvsp, str, "hgvsp") else: self._hgvsp = None self._protein_effect_location = hpotk.util.validate_optional_instance( protein_effect_coordinates, Region, "protein_effect_coordinates", ) @property def gene_id(self) -> str: """ Get the HGVS symbol of the affected gene (e.g. *SURF1*). """ return self._gene_id @property def transcript_id(self) -> str: """ Get the RefSeq identifier of the transcript (e.g. `NM_123456.7`). """ return self._tx_id @property def is_preferred(self) -> bool: """ Return `True` if the transcript is the preferred transcript of a gene, such as MANE transcript, canonical Ensembl transcript. """ return self._is_preferred @property def hgvs_cdna(self) -> typing.Optional[str]: """ Get the HGVS description of the sequence variant (e.g. ``NM_123456.7:c.9876G>T``) or `None` if not available. """ return self._hgvs_cdna @property def variant_effects(self) -> typing.Sequence[VariantEffect]: """ Get a sequence of the predicted functional variant effects. """ return self._variant_effects @property def overlapping_exons(self) -> typing.Optional[typing.Sequence[int]]: """ Get a sequence of 1-based exon indices (the index of the 1st exon is `1`) that overlap with the variant. """ return self._affected_exons @property def protein_id(self) -> typing.Optional[str]: """ Get the ID of the protein encoded by the :attr:`transcript_id` or `None` if the transcript is not protein-coding. """ return self._protein_id @property def hgvsp(self) -> typing.Optional[str]: """ Get the HGVS description of the protein sequence variant (e.g. ``NP_001027559.1:p.Gln421Ter``) or `None` if not available. """ return self._hgvsp @property def protein_effect_location(self) -> typing.Optional[Region]: """ Get the :class:`~gpsea.model.genome.Region` with start and end amino-acid coordinates affected by the variant. """ return self._protein_effect_location def __str__(self) -> str: return ( f"TranscriptAnnotation(gene_id:{self._gene_id}," f"transcript_id:{self._tx_id}," f"hgvs_cdna:{self._hgvs_cdna}," f"is_preferred:{self._is_preferred}," f"variant_effects:{self._variant_effects}," f"overlapping_exons:{self._affected_exons}," f"protein_id:{self._protein_id}," f"hgvsp:{self._hgvsp}," f"protein_effect_location:{self._protein_effect_location})" ) def __eq__(self, other) -> bool: return ( isinstance(other, TranscriptAnnotation) and self._gene_id == other._gene_id and self._tx_id == other._tx_id and self._hgvs_cdna == other._hgvs_cdna and self._is_preferred == other._is_preferred and self._variant_effects == other._variant_effects and self._affected_exons == other._affected_exons and self._protein_id == other._protein_id and self._hgvsp == other._hgvsp and self._protein_effect_location == other._protein_effect_location ) def __repr__(self) -> str: return str(self) def __hash__(self) -> int: return hash( ( self._gene_id, self._tx_id, self._hgvs_cdna, self._is_preferred, self._variant_effects, self._affected_exons, self._protein_id, self._hgvsp, self._protein_effect_location, ) )
[docs] class VariantClass(enum.Enum): """ `VariantClass` represents a high-level variant category which mostly corresponds to the structural variant categories of the Variant Call Format specification, but includes type for single nucleotide variants (SNV) and multi-nucleotide variant (MNV). """ DEL = 0 """ A deletion - a variant with a net loss of sequence from the alternative allele regardless of its size. Both a deletion of 1 base pair and a deletion of 1000 base pairs are acceptable. """ DUP = 1 """ Duplication (tandem or interspersed). """ INS = 2 """ Insertion of a novel sequence. """ INV = 3 """ Inversion of a chromosome segment. """ MNV = 4 """ Multi-nucleotide variant (e.g. `AG>CT`) that is not a duplication, deletion, or insertion. May be called INDEL. """ SNV = 5 """ Single nucleotide variant. """ TRANSLOCATION = 6 """ A chromosomal translocation, which occurs when a chromosome breaks and the (typically two) fragmented pieces re-attach to different chromosomes. """
[docs] class VariantCoordinates: """ A representation of coordinates of sequence and symbolic variants. Note, the breakend variants are not currently supported. """
[docs] @staticmethod def from_vcf_literal( contig: Contig, pos: int, ref: str, alt: str, ): """ Create `VariantCoordinates` from a variant in VCF literal notation. Note, this function must *not* be used to create a VCF symbolic variant (e.g. `<DEL>` or translocation). Use :func:`from_vcf_symbolic` instead. **Example** Create a variant from a VCF line: ``` #CHROM POS ID REF ALT ... chr1 1001 . C T ``` We first must decide on the genome build. Most of the time, we should use GRCh38: >>> from gpsea.model.genome import GRCh38 >>> build = GRCh38 Then, we access the contig for ``'chr1'``: >>> chr1 = build.contig_by_name('chr1') Last, we create the variant coordinates: >>> from gpsea.model import VariantCoordinates >>> vc = VariantCoordinates.from_vcf_literal( ... contig=chr1, pos=1001, ref='C', alt='T', ... ) Now can test the properties: >>> vc.start, vc.end, vc.ref, vc.alt, len(vc), vc.change_length (1000, 1001, 'C', 'T', 1, 0) Args: contig: a :class:`~gpsea.model.genome.Contig` for the chromosome pos: a 1-based coordinate of the first base of the reference allele, as described in VCF standard ref: a `str` with the REF allele. Should meet the requirements of the VCF standard. alt: a `str` with the ALT allele. Should meet the requirements of the VCF standard. """ # TODO: test that we're getting proper alleles region = GenomicRegion( contig=contig, start=pos - 1, end=pos + len(ref) - 1, strand=Strand.POSITIVE, ) change_length = len(ref) - len(alt) return VariantCoordinates( region=region, ref=ref, alt=alt, change_length=change_length, )
[docs] @staticmethod def from_vcf_symbolic( contig: Contig, pos: int, end: int, ref: str, alt: str, svlen: int, ): """ Create `VariantCoordinates` from a variant in VCF symbolic notation. Note, this function must *not* be used to create a VCF sequence/literal variant. Use :func:`from_vcf_literal` instead. **Example** Let's create a symbolic variant from the line: ``` #CHROM POS ID REF ALT QUAL FILTER INFO 2 321682 . T <DEL> 6 PASS SVTYPE=DEL;END=321887;SVLEN=-205 ``` We first must decide on the genome build. Most of the time, we should use GRCh38: >>> from gpsea.model.genome import GRCh38 >>> contig = GRCh38.contig_by_name('2') Now, we create the coordinates as: >>> vc = VariantCoordinates.from_vcf_symbolic( ... contig=contig, pos=321682, end=321887, ... ref='T', alt='<DEL>', svlen=-205, ... ) Now can test the properties: >>> vc.start, vc.end, vc.ref, vc.alt, len(vc), vc.change_length (321681, 321887, 'T', '<DEL>', 206, -205) Args: contig: a :class:`~gpsea.model.genome.Contig` for the chromosome pos: a 1-based coordinate of the first base of the affected reference allele region end: a 1-based coordinate of the last base of the affected reference allele region ref: a `str` with the REF allele. Most of the time, it is one of `{'N', 'A', 'C', 'G', 'T'}` alt: a `str` with the ALT allele, e.g. one of `{'<DEL>', '<DUP>', '<INS>', '<INV>'}` svlen: an `int` with change length (the difference between ref and alt allele lengths) """ # TODO: test that we're getting proper alleles region = GenomicRegion( contig=contig, start=pos - 1, # convert to 0-based coordinates, end=end, strand=Strand.POSITIVE, ) return VariantCoordinates( region=region, ref=ref, alt=alt, change_length=svlen, )
def __init__(self, region: GenomicRegion, ref: str, alt: str, change_length: int): self._region = hpotk.util.validate_instance(region, GenomicRegion, "region") self._ref = hpotk.util.validate_instance(ref, str, "ref") self._alt = hpotk.util.validate_instance(alt, str, "alt") self._change_length = hpotk.util.validate_instance( change_length, int, "change_length" ) @property def chrom(self) -> str: """ Get the label of the chromosome/contig where the variant is located. """ return self._region.contig.name @property def start(self) -> int: """ Get the 0-based start coordinate (excluded) of the first base of the :attr:`ref` allele. """ return self._region.start @property def end(self) -> int: """ Get the 0-based end coordinate (included) of the last base of the :attr:`ref` allele. """ return self._region.end @property def region(self) -> GenomicRegion: """ Get the genomic region spanned by the :attr:`ref` allele. """ return self._region @property def ref(self) -> str: """ Get the reference allele (e.g. "A", "CCT", "N"). The allele may be an empty string. """ return self._ref @property def alt(self) -> str: """ Get the alternate allele (e.g. "A", "GG", "<DEL>"). The allele may be an empty string for sequence variants. The symbolic alternate allele follow the VCF notation and use the `<` and `>` characters (e.g. "<DEL>", "<INS:ME:SINE>"). """ return self._alt @property def change_length(self) -> int: """ Get the change of length between the `ref` and `alt` alleles due to the variant presence. See :ref:`change-length-of-an-allele` for more info. """ return self._change_length @property def variant_key(self) -> str: """ Get a readable representation of the variant's coordinates. For instance, ``X_12345_12345_C_G`` for a sequence variant or ``22_10001_20000_INV`` for a symbolic variant. If the key is larger than 50 characters, the 'ref' and/or 'alt' (if over 10 bps) are changed to just show number of bps. Example: ``X_1000001_1000027_TAAAAAAAAAAAAAAAAAAAAAAAAAA_T`` -> ``X_1000001_1000027_--27bp--_T`` .. note:: Both *start* and *end* coordinates use 1-based (included) coordinate system. """ if self.is_structural(): return f"{self.chrom}_{self.start + 1}_{self.end}_{self.alt[1:-1]}" else: key = f"{self.chrom}_{self.start + 1}_{self.end}_{self.ref}_{self.alt}" if len(key) > 50: if len(self.ref) > 10: ref = f"--{len(self.ref)}bp--" else: ref = self.ref if len(self.alt) > 10: alt = f"--{len(self.alt)}bp--" else: alt = self.alt return f"{self.chrom}_{self.start + 1}_{self.end}_{ref}_{alt}" else: return key @property def variant_class(self) -> VariantClass: """ Get a :class:`VariantClass` category. """ if self.is_structural(): # Expecting a `str` like <DEL>, <INS>, <DUP>, <INV>, ... return VariantClass[self.alt[1:-1]] else: if len(self.ref) > len(self.alt): return VariantClass.DEL elif len(self.ref) < len(self.alt): # may also be a duplication, but it's hard to say from this return VariantClass.INS else: if len(self.ref) == 1: return VariantClass.SNV else: return VariantClass.MNV
[docs] def is_structural(self) -> bool: """ Checks if the variant coordinates use structural variant notation as described by Variant Call Format (`VCF <https://en.wikipedia.org/wiki/Variant_Call_Format>`_). Ane example of *structural* variant notation:: chr5 101 . N <DEL> . . SVTYPE=DEL;END=120;SVLEN=-10 as opposed to the *sequence* (literal) notation:: chr5 101 . NACGTACGTAC N :return: `True` if the variant coordinates use structural variant notation. """ return ( len(self._alt) != 0 and self._alt.startswith("<") and self._alt.endswith(">") )
def __len__(self): """ Get the number of bases on the ref allele that are affected by the variant. """ return len(self._region) def __eq__(self, other) -> bool: return ( isinstance(other, VariantCoordinates) and self.region == other.region and self.ref == other.ref and self.alt == other.alt and self.change_length == other.change_length ) def __hash__(self) -> int: return hash((self._region, self._ref, self._alt, self._change_length)) def __str__(self) -> str: return ( f"VariantCoordinates(region={self.region}, " f"ref={self.ref}, alt={self.alt}, " f"change_length={self.change_length})" ) def __repr__(self) -> str: return str(self)
[docs] class ImpreciseSvInfo: """ Data regarding a structural variant (SV) with imprecise breakpoint coordinates. """ def __init__( self, structural_type: hpotk.TermId, variant_class: VariantClass, gene_id: str, gene_symbol: str, ): self._structural_type = hpotk.util.validate_instance( structural_type, hpotk.TermId, "structural_type" ) self._variant_class = hpotk.util.validate_instance( variant_class, VariantClass, "variant_class" ) self._gene_id = gene_id self._gene_symbol = gene_symbol @property def structural_type(self) -> hpotk.TermId: """ Get term ID of the structural type (e.g. ``SO:1000029`` for chromosomal deletion). """ return self._structural_type @property def variant_class(self) -> VariantClass: """ Get a :class:`VariantClass` category. """ return self._variant_class @property def gene_id(self) -> str: """ Get a `str` with gene identifier CURIE (e.g. ``HGNC:3603``) or `None` if the identifier is not available. """ return self._gene_id @property def gene_symbol(self) -> str: """ Get a `str` with HGVS gene symbol (e.g. *FBN1*) or `None` if the symbol is not available. """ return self._gene_symbol @property def variant_key(self) -> str: """ Get a readable representation of the variant. """ return f"{self._structural_type}_{self._gene_id}_{self._gene_symbol}" def __eq__(self, value: object) -> bool: return ( isinstance(value, ImpreciseSvInfo) and self._structural_type == value._structural_type and self._variant_class == value._variant_class and self._gene_id == value._gene_id and self._gene_symbol == value._gene_symbol ) def __hash__(self) -> int: return hash( ( self._structural_type, self._variant_class, self._gene_id, self._gene_symbol, ) ) def __str__(self) -> str: return ( f"ImpreciseSv(" f"structural_type={self._structural_type}, " f"variant_class={self._variant_class}, " f"gene_id={self._gene_id}, " f"gene_symbol={self._gene_symbol})" ) def __repr__(self) -> str: return str(self)
[docs] class VariantInfo: """ `VariantInfo` consists of either variant coordinates or imprecise SV data. The class is conceptually similar to Rust enum - only one of the fields can be set at any point in time. """ def __init__( self, variant_coordinates: typing.Optional[VariantCoordinates] = None, sv_info: typing.Optional[ImpreciseSvInfo] = None, ): if (variant_coordinates is None) != (sv_info is None): # At most one field can be set. self._variant_coordinates = variant_coordinates self._sv_info = sv_info else: raise ValueError( f"Only one field can be set: variant_coordinates={variant_coordinates}, sv_info={sv_info}" ) @property def variant_coordinates(self) -> typing.Optional[VariantCoordinates]: """ Get variant coordinates if available. """ return self._variant_coordinates
[docs] def has_variant_coordinates(self) -> bool: """ Returns `True` if the variant coordinates are available. """ return self.variant_coordinates is not None
@property def sv_info(self) -> typing.Optional[ImpreciseSvInfo]: """ Get information about large imprecise SV. """ return self._sv_info
[docs] def has_sv_info(self) -> bool: """ Returns `True` if the variant is a large imprecise SV and the exact coordinates are thus unavailable. """ return self.sv_info is not None
@property def variant_key(self) -> str: """ Get a readable representation of the variant's coordinates or the large SV info. """ if self.has_variant_coordinates(): return self.variant_coordinates.variant_key elif self.has_sv_info(): return self.sv_info.variant_key else: self._handle_missing_state() @property def variant_class(self) -> VariantClass: """ Get a :class:`VariantClass` category. """ if self.has_variant_coordinates(): return self.variant_coordinates.variant_class elif self.has_sv_info(): return self.sv_info.variant_class else: self._handle_missing_state()
[docs] def is_structural(self) -> bool: """ Test if the variant is a structural variant. This can either be because the variant coordinates use the structural variant notation (see :meth:`VariantCoordinates.is_structural`) or if the variant is large imprecise SV. """ if self.has_variant_coordinates(): return self.variant_coordinates.is_structural() elif self.has_sv_info(): return True else: self._handle_missing_state()
def _handle_missing_state(self): raise ValueError( "VariantInfo should have either variant coordinates or SV info!" ) def __eq__(self, value: object) -> bool: return ( isinstance(value, VariantInfo) and self._variant_coordinates == value._variant_coordinates and self._sv_info == value._sv_info ) def __hash__(self) -> int: return hash((self._variant_coordinates, self._sv_info)) def __str__(self) -> str: return ( f"VariantInfo(" f"variant_coordinates={self._variant_coordinates}, " f"sv_info={self._sv_info})" ) def __repr__(self) -> str: return str(self)
[docs] class VariantInfoAware(metaclass=abc.ABCMeta): """ An entity where :class:`VariantInfo` is available. """ @property @abc.abstractmethod def variant_info(self) -> VariantInfo: """ Get the variant data with coordinates or other info available for large imprecise SVs. """ pass
[docs] class FunctionalAnnotationAware(metaclass=abc.ABCMeta): @property @abc.abstractmethod def tx_annotations(self) -> typing.Sequence[TranscriptAnnotation]: pass
[docs] def get_tx_anno_by_tx_id( self, transcript_id: str ) -> typing.Optional[TranscriptAnnotation]: """Given a transcript ID, this will return the `TranscriptAnnotation` associated with that variant and transcript. Args: transcript_id (str): A transcript ID - i.e. 'NM_170707.4' Returns: typing.Optional[TranscriptAnnotation]: The Transcript Annotation if available. """ for tx_ann in self.tx_annotations: if tx_ann.transcript_id == transcript_id: return tx_ann return None
[docs] def get_hgvs_cdna_by_tx_id(self, transcript_id: str) -> typing.Optional[str]: """Given a transcript ID, will return the hgvs cdna string associated with that variant and transcript. Args: transcript_id (str): A transcript ID - i.e. 'NM_170707.4' Returns: str or None: The hgvs cdna if available - i.e. 'NM_170707.4:c.1824C>T' """ for tx_ann in self.tx_annotations: if tx_ann.transcript_id == transcript_id: return tx_ann.hgvs_cdna return None
[docs] def get_preferred_tx_annotation(self) -> typing.Optional[TranscriptAnnotation]: """Get the `TranscriptAnnotation` that represents the result of the functional annotation with respect to the preferred transcript of a gene. Returns `None` if transcript annotations is no preferred transcript found. Returns: typing.Optional[TranscriptAnnotation]: The `TranscriptAnnotation` with respect to the preferred transcript or `None` if the preferred transcript info is not available. """ for tx in self.tx_annotations: if tx.is_preferred: return tx return None
[docs] class Variant(VariantInfoAware, FunctionalAnnotationAware, Genotyped): """ Variant includes three lines of information: * the variant data with coordinates or other info available for large imprecise SVs, * results of the functional annotation with respect to relevant transcripts, and * the genotypes for the known samples """
[docs] @staticmethod def create_variant_from_scratch( variant_coordinates: VariantCoordinates, gene_name: str, trans_id: str, hgvs_cdna: str, is_preferred: bool, consequences: typing.Iterable[VariantEffect], exons_effected: typing.Sequence[int], protein_id: typing.Optional[str], hgvsp: typing.Optional[str], protein_effect_start: typing.Optional[int], protein_effect_end: typing.Optional[int], genotypes: Genotypes, ): variant_info = VariantInfo(variant_coordinates=variant_coordinates) if protein_effect_start is not None and protein_effect_end is not None: protein_effect = Region(protein_effect_start, protein_effect_end) else: protein_effect = None transcript = TranscriptAnnotation( gene_name, trans_id, hgvs_cdna, is_preferred, consequences, exons_effected, protein_id, hgvsp, protein_effect, ) return Variant(variant_info, (transcript,), genotypes)
def __init__( self, variant_info: VariantInfo, tx_annotations: typing.Iterable[TranscriptAnnotation], genotypes: Genotypes, ): self._variant_info = variant_info self._tx_annotations = tuple(tx_annotations) self._gts = genotypes @property def variant_info(self) -> VariantInfo: """ Get the representation of the variant data for sequence and symbolic variants, as well as for large imprecise SVs. """ return self._variant_info @property def tx_annotations(self) -> typing.Sequence[TranscriptAnnotation]: """A collection of TranscriptAnnotations that each represent results of the functional annotation of a variant with respect to single transcript of a gene. Returns: Sequence[TranscriptAnnotation]: A sequence of TranscriptAnnotation objects """ return self._tx_annotations @property def genotypes(self) -> Genotypes: return self._gts def __eq__(self, other) -> bool: return ( isinstance(other, Variant) and self._variant_info == other._variant_info and self._tx_annotations == other._tx_annotations and self._gts == other._gts ) def __hash__(self) -> int: return hash((self._variant_info, self._tx_annotations, self._gts)) def __repr__(self) -> str: return str(self) def __str__(self) -> str: return ( f"Variant(variant_info={self._variant_info}, " f"tx_annotations={self._tx_annotations}, " f"genotypes={self._gts})" )