Source code for gpsea.preprocessing._patient

import abc
import dataclasses
import typing

from stairval.notepad import Notepad

from gpsea.model import Patient, Cohort

T = typing.TypeVar("T")
"""
The input for `PatientCreator`.

It can be any object that contains the patient data (e.g. a phenopacket).
"""


class PatientCreator(typing.Generic[T], metaclass=abc.ABCMeta):
    """
    Abstract mapper that turns an input item of type `T` into a `Patient`.

    Concrete subclasses must implement :meth:`process`.
    """

    @abc.abstractmethod
    def process(self, item: T, notepad: Notepad) -> typing.Optional[Patient]:
        """
        Map the `item` into a `Patient`.

        :param item: the input object with the patient data.
        :param notepad: the notepad handed in by the caller;
            presumably the place to record issues found in the `item`
            (the exact usage is up to the implementation).
        :returns: the `Patient` or `None`; `CohortCreator.process` in this module
            skips the items for which `None` is returned.
        """
@dataclasses.dataclass
class CohortCreatorOptions:
    """
    Options for :class:`~gpsea.preprocessing.CohortCreator`.

    Both flags default to `False`, i.e. the individuals lacking
    the corresponding data are dropped from the cohort.
    """

    # `True` if the individuals with no phenotypes should be kept in the cohort.
    keep_individuals_with_no_hpo: bool = False

    # `True` if the individuals with no variants should be kept in the cohort.
    keep_individuals_with_no_variants: bool = False
class CohortCreator(typing.Generic[T]):
    """
    `CohortCreator` turns the provided `inputs` into a cohort,
    applying Q/C and filtering to the cohort members along the way.

    Each input item is mapped into a :class:`~gpsea.model.Patient`
    by the inner :class:`~gpsea.preprocessing.PatientCreator`.
    The cohort creator is generic over the input item type `T`; the only requirement
    is that the inner :class:`~gpsea.preprocessing.PatientCreator`
    knows how to map `T` into a :class:`~gpsea.model.Patient`.

    **Q/C checks**

    The members are checked for duplicates and the duplicates are reported into the `notepad`.
    The `notepad` also receives the issues found by :class:`~gpsea.preprocessing.PatientCreator`.
    Note, a cohort *is* created even in presence of Q/C errors.

    **Filtering**

    After mapping `T` into cohort members, the following filters are applied:

    * the individuals with 0 phenotypes are removed, unless
      :attr:`gpsea.preprocessing.CohortCreatorOptions.keep_individuals_with_no_hpo` is set
    * the individuals with 0 variants are removed, unless
      :attr:`gpsea.preprocessing.CohortCreatorOptions.keep_individuals_with_no_variants` is set

    **Cohort member order**

    Cohort creator guarantees stable order of the cohort members,
    i.e. iterating over :class:`~gpsea.model.Cohort` yields the cohort members
    in the same order as they were seen in the `inputs` iterable.

    :param patient_creator: an instance of :class:`~gpsea.preprocessing.PatientCreator`
        to map `T` into :class:`~gpsea.model.Patient`.
    :param options: cohort creator options or `None` if default options should be used.
    """

    def __init__(
        self,
        patient_creator: PatientCreator[T],
        options: typing.Optional[CohortCreatorOptions] = None,
    ):
        # We can only verify that we got *some* `PatientCreator`.
        # Checking that its `T` matches the `T` of this `CohortCreator`
        # is not possible at runtime.
        assert isinstance(patient_creator, PatientCreator)
        self._pc = patient_creator

        if options is not None:
            assert isinstance(options, CohortCreatorOptions)
        else:
            # No options provided, fall back to the defaults.
            options = CohortCreatorOptions()
        self._options = options

    def process(
        self,
        inputs: typing.Iterable[T],
        notepad: Notepad,
    ) -> Cohort:
        """
        Process the `inputs` into a :class:`~gpsea.model.Cohort`
        and write any Q/C issues into the `notepad`.

        :param inputs: an iterable with the items to map into cohort members.
        :param notepad: the notepad for Q/C issues; each input item gets
            its own subsection and the duplicate report goes to the `notepad` itself.
        :returns: the cohort made of the members who pass the filtering.
        """
        members = []
        seen_labels = set()
        duplicated_labels = set()

        for idx, item in enumerate(inputs):
            # Each item reports into a dedicated subsection labeled with its position.
            section = notepad.add_subsection(f"patient #{idx}")
            member = self._pc.process(item, section)
            if member is None:
                # The patient creator produced no patient for this item.
                continue

            labels = member.labels
            if labels in seen_labels:
                duplicated_labels.add(labels)
            else:
                seen_labels.add(labels)
            # Duplicates are reported, but they are still kept as members.
            members.append(member)

        if duplicated_labels:
            # Sort the summaries to keep the message independent of the set iteration order.
            summaries = sorted(labels.label_summary() for labels in duplicated_labels)
            notepad.add_error(
                f"Patient ID/s {', '.join(summaries)} have a duplicate",
                "Please verify every patient has an unique ID.",
            )

        # The filtering stays lazy, the members are tested as the cohort consumes them.
        return Cohort.from_patients(
            members=(member for member in members if self._passes_filtering(member)),
        )

    def _passes_filtering(
        self,
        patient: Patient,
    ) -> bool:
        """
        Test if the `patient` should be retained in the cohort under the current options.

        :returns: `False` if the patient has no phenotypes or no variants
            and the corresponding `keep_*` option is not set, `True` otherwise.
        """
        opts = self._options

        if len(patient.phenotypes) == 0 and not opts.keep_individuals_with_no_hpo:
            return False

        if len(patient.variants) == 0 and not opts.keep_individuals_with_no_variants:
            return False

        return True