Source code for importer.handlers

from __future__ import annotations

import logging
from copy import deepcopy
from dataclasses import dataclass
from typing import Iterable
from typing import List
from typing import Set
from typing import Type

from django.conf import settings
from django.utils.functional import classproperty
from rest_framework.serializers import ModelSerializer

from common.models import TrackedModel
from common.validators import UpdateType
from importer.nursery import TariffObjectNursery
from importer.utils import DispatchedObjectType
from importer.utils import LinksType
from importer.utils import generate_key
from taric_parsers.importer_issue import ImportIssueReportItem

logger = logging.getLogger(__name__)


@dataclass
class DependencyMappingData:
    """
    Data class for temporarily storing dependency data while checking and
    reporting issues. It is used to populate the issue report if issues are
    detected.

    params:
        key: str, required. The string used to identify the expected cache key
            for the dependency.
        tag: str, required. The tag used in TARIC3 to represent the object type
            of the dependency.
        identifying_fields: iterable of str, required. The identity fields
            defined for the current object.
        data: dict, required. A dictionary of all associated data for the
            current object.
    """

    key: str
    tag: str
    identifying_fields: Iterable[str]
    data: dict

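# A minimal sketch of a mapping entry (all values hypothetical) as recorded by
# a handler for each of its dependencies, so that a missing dependency can later
# be reported with its tag and identifying data:
#
#   DependencyMappingData(
#       key="additional.code.description:123",
#       tag="additional.code.description",
#       identifying_fields=("sid",),
#       data={"sid": "123", "description": "Some description"},
#   )
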
class MismatchedSerializerError(Exception):
    pass

class BaseHandlerMeta(type):
    """
    BaseHandler metaclass which adds validation and registration for each new
    handler class.

    Handlers have relatively strict requirements for them to function. Firstly
    there are two required attributes:

    1. "tag" - a string which matches the handler against the incoming data.
    2. "serializer_class" - a ModelSerializer which is used to validate and
       create the database object.

    Without these attributes the class cannot function properly. To ensure they
    are defined, the metaclass checks for their existence and type. If they
    aren't properly defined, an error is raised at compile time (i.e. on
    import).

    The second requirement is that handlers are attached to the nursery, so
    that the nursery can match them against the incoming data. To accommodate
    this, all handlers are automatically registered with the nursery class once
    validated. This reduces boilerplate code.
    """

    def __new__(cls, name: str, bases: tuple, dct: dict):
        handler_class = super().__new__(cls, name, bases, dct)

        if not bases:
            # This is a top level class; only subclasses are validated and
            # registered.
            return handler_class

        if dct.get("abstract"):
            return handler_class

        handler_class.abstract = False

        if not isinstance(getattr(handler_class, "tag", None), str):
            raise AttributeError(f'{name} requires attribute "tag" to be a str.')

        if not issubclass(
            getattr(handler_class, "serializer_class", type),
            ModelSerializer,
        ):
            raise AttributeError(
                f'{name} requires attribute "serializer_class" to be a subclass of "ModelSerializer".',
            )

        TariffObjectNursery.register_handler(handler_class)
        return handler_class

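# Illustrative sketch of the metaclass contract (the handler and serializer
# names here are hypothetical): a concrete subclass of BaseHandler must define
# both "tag" and "serializer_class", otherwise an AttributeError is raised the
# moment the defining module is imported.
#
#   class FootnoteHandler(BaseHandler):
#       serializer_class = serializers.FootnoteSerializer  # a ModelSerializer
#       tag = "footnote"  # matched against incoming record tags
#
#   class BrokenHandler(BaseHandler):  # no "tag" attribute defined
#       serializer_class = serializers.FootnoteSerializer
#   # -> AttributeError: BrokenHandler requires attribute "tag" to be a str.
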
class BaseHandler(metaclass=BaseHandlerMeta):
    """
    The base class for import handlers.

    Handlers are designed to build objects which are then ready to be entered
    into the database. This effectively takes place in 8 stages:

    Init:
        1. The handler is initialised with the initial data.

    Build:
        2. The handler checks for dependencies and links. If there are none it
           goes to step 5.
        3. The handler searches for dependencies which may contain extra
           required data. If any can't be found it asks to be cached and
           resolved later, and the process stops. If they are found it unifies
           the data.
        4. The handler searches for any links (foreign keys) that it needs. If
           any can't be found it asks to be cached and resolved later, and the
           process stops. If they are found it stores them.

    Dispatch:
        5. The handler validates the complete data against the serializer.
        6. The handler runs any pre-save processing, including adding the
           foreign keys to the validated data.
        7. The handler saves the object to the database.
        8. The handler runs any post-save processing.

    Every model is likely to have some specific custom demands, so
    customisation is a focus within this system: most steps within this process
    can be customised and overridden. Many use cases should, however, be
    workable with just the base. A few examples of different scenarios follow
    below.

    **Example 1:** A simple object with no dependencies or links.

    For very simple objects there should be almost no work to do, assuming the
    data comes in clean without any need for editing. In this case it should be
    enough to define a handler like so:

    .. code:: python

        class SimpleObjectHandler(BaseHandler):
            serializer_class = serializers.SimpleObjectSerializer
            tag = parsers.SimpleObjectParser.tag.name

    Any object like this is processed immediately when run through the nursery
    as, without any dependencies or links, there is nothing it needs to wait
    on.

    **Example 2:** An object with dependencies.

    .. code:: python

        class DependentModelAHandler(BaseHandler):
            serializer_class = serializers.DependentModelSerializer
            tag = parsers.DependentModelAParser.tag.name


        @DependentModelAHandler.register_dependant
        class DependentModelBHandler(BaseHandler):
            dependencies = [DependentModelAHandler]
            serializer_class = serializers.DependentModelSerializer
            tag = parsers.DependentModelBParser.tag.name

    "Dependencies" in this case means two pre-existing models have been merged
    into one. Sadly the data import doesn't account for this, so the complete
    set of data for the object is expected to arrive over several records. A
    handler must therefore be created for each expected record type. All
    records are then stored in the nursery cache until they can be collected
    together for processing.

    As dependencies are all supposed to relate to the same model, they must all
    share the same serializer.

    Dependencies cascade. Consequently, if object A depends on object B, and
    object B depends on object C, then object A also depends on object C.
    However, given the nature of the dependencies, it is preferable for every
    handler to be explicit about all the dependencies it relies on.

    Dependencies are assigned at class level. Due to the way compilation works
    this creates issues with forward-referencing. To handle this there are two
    mechanisms for registering dependencies. The first is simply a class level
    ``dependencies`` list containing the dependency classes. The second is the
    ``register_dependant`` decorator, which allows a class to decorate itself -
    thereby inserting itself into the aforementioned dependency list of a
    pre-existing class.

    With this defined, the nursery collects the data for each dependency. The
    dependencies can then query the nursery to collect all the data. If all the
    data is found, the object is dispatched to the database and the nursery
    removes the relevant data from the cache.

    **Example 3:** An object with foreign key links.

    .. code:: python

        class LinkedObjectHandler(BaseHandler):
            links = (
                {
                    "model": models.LinkToModelA,
                    "name": "link_to_model_a",
                    "optional": True,
                },
                {
                    "model": models.LinkToModelB,
                    "name": "link_to_model_b",
                    "optional": False,
                    "identifying_fields": ("some_field", "some_other_model_id"),
                },
            )

            def get_link_to_model_b_link(self, model, kwargs):
                other_model = models.SomeOtherModel(
                    field2=kwargs.get("some_other_model_id"),
                )
                return model.objects.get_latest_version(
                    other_model=other_model,
                    **kwargs,
                )

    Foreign key links are more flexible than dependencies. Whilst dependencies
    denote records which hold part of the data for an object, links denote
    pre-existing objects which the current object needs to link to.

    The difficulty here is that there is no guarantee that the pre-existing
    object has actually been created yet, nor that the object necessarily
    *needs* to exist (the foreign key could be nullable).

    Therefore handlers have the option of adding a ``links`` attribute, which
    should be an iterable of ``LinksType`` style dictionaries. Each link must
    define two keys:

    1. model - expected to be a ``TrackedModel`` subclass.
    2. name - a string determining how the link data is differentiated from the
       object data, as well as how it is named in the data when saved to the
       database. More specifically, the incoming data for the linked field
       (specifically the model's identifying fields) is expected to be prefixed
       with this name, so the parser must define fields with this prefix.

    Two other optional keys exist:

    3. optional - defines whether a link is optional. If it is optional the
       object is saved even if the link can't be found. If it is not optional
       the object is cached until the link can be found.
    4. identifying_fields - on occasion the identifying fields from
       ``model.identifying_fields`` may not be appropriate; in this case they
       can be overridden here.

    With just these, the handler automatically tries to fetch the linked model
    using the identifying fields of the model (or those given in the link
    dictionary). Once fetched, the handler stores the data under the given
    name. When ready to save, the links are added to the object data under the
    given name and the object is saved.

    In some cases this is not enough and further customisation is needed when
    fetching links. An example is a link where one of the identifying fields is
    a foreign key on the linked model (i.e. the linked model itself has another
    link). To allow for this, a method can be added to fetch the link
    appropriately. This method must be named ``get_{link_name}_link`` and
    accept the arguments ``model`` and ``kwargs``. It must return whatever
    object is intended to go into the data under the name given to the link.

    All of the above examples can be used together, e.g. a handler can have
    both dependencies and links.
    """

    dependencies: List[Type[BaseHandler]] = None
    identifying_fields: Iterable[str] = None
    links: Iterable[LinksType] = None
    serializer_class: Type[ModelSerializer] = None
    tag: str = None
    dependency_key_mapping: List[DependencyMappingData] = list()
    import_issues: List[ImportIssueReportItem] = list()
    dependency_keys: Set[str] = set()

    def __init__(
        self,
        dispatched_object: DispatchedObjectType,
        nursery: TariffObjectNursery,
    ):
        self.nursery = nursery
        self.data = dispatched_object["data"]
        if not self.identifying_fields:
            self.identifying_fields = self.model.identifying_fields
        self.transaction_id = dispatched_object["transaction_id"]
        self.key = generate_key(
            tag=self.tag,
            identifying_fields=self.identifying_fields,
            data=self.data,
        )
        self.dependency_keys = self._generate_dependency_keys()
        self.resolved_links = {}

    def _generate_dependency_keys(self) -> Set[str]:
        """
        Objects are stored in the cache using unique but identifiable IDs.

        Any dependent object must be able to figure out all the keys for its
        dependencies. This method fetches all the dependencies, builds their
        keys and then returns the keys as a set.
        """
        depends_on = set()
        if not self.dependencies:
            return depends_on

        for dependency in self.dependencies:
            if dependency.serializer_class != self.serializer_class:
                raise MismatchedSerializerError(
                    f"Dependent parsers must have the same serializer_class as their dependencies. "
                    f"Dependency {dependency.__name__} has "
                    f"serializer_class {dependency.serializer_class.__name__}. "
                    f"{self.__class__.__name__} has serializer_class {self.serializer_class.__name__}.",
                )

            key = generate_key(
                tag=dependency.tag,
                identifying_fields=self.identifying_fields,
                data=self.data,
            )

            # Record the mapping so the correct data can be used later when
            # reporting issues back to the user.
            self.dependency_key_mapping.append(
                DependencyMappingData(
                    key,
                    dependency.tag,
                    self.identifying_fields,
                    self.data,
                ),
            )

            depends_on.add(key)

        return depends_on

    def resolve_dependencies(self) -> bool:
        """
        Search the cache for all object dependencies and attempt to resolve
        them.

        Previously found objects, which the current object depends on, should
        be stored in the cache. This method loops over the current object's
        dependencies and attempts to extract them from the cache. It then also
        searches for the dependencies of the extracted objects. All found
        dependencies are merged into the current object's data.

        If at any point a dependency is not found, the method returns False -
        to signify the object cannot be resolved. If all dependencies are found
        the method returns True.
        """
        dependencies = self.dependency_keys.copy()
        resolved_dependencies = {self.key}

        while dependencies:
            key = dependencies.pop()
            dependency = self.nursery.get_handler_from_cache(key)
            if not dependency:
                return False
            self.data.update(dependency.data)
            resolved_dependencies.add(key)
            dependencies.update(
                set(dependency.dependency_keys) - resolved_dependencies,
            )

        return True

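    # Sketch of cascading resolution (keys hypothetical): if A depends on B and
    # B depends on C, popping B's key merges B's data and pulls C's key into
    # the work set, so C's data is merged too before True is returned.
    #
    #   dependencies = {"key_b"}       # pop "key_b" -> merge B.data
    #   B.dependency_keys = {"key_c"}  # "key_c" joins the work set
    #                                  # pop "key_c" -> merge C.data -> True
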
    def _get_missing_dependencies(self) -> List[str]:
        """
        Returns a list of dependencies that are not in the cache.

        returns:
            List[str]. A list of dependency keys expected / required for the
            current object but not present in the cache.
        """
        dependencies = self.dependency_keys.copy()
        missing_dependency_keys = []

        while dependencies:
            key = dependencies.pop()
            dependency = self.nursery.get_handler_from_cache(key)
            if not dependency:
                missing_dependency_keys.append(key)

        return missing_dependency_keys

    def clean(self, data: dict) -> dict:
        """Validate the data against the serializer and return the validated
        data."""
        serializer = self.serializer_class(data=data)
        serializer.is_valid(raise_exception=True)
        return serializer.validated_data

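    # Sketch (field name and value hypothetical): validation is delegated
    # entirely to the DRF serializer, so malformed records fail fast.
    #
    #   handler.clean({"sid": "not-an-int"})
    #   # -> raises rest_framework.exceptions.ValidationError
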
    def pre_save(self, data: dict, links: dict) -> dict:
        """
        Pre-processing before the object is saved to the database.

        Generally this is used for adding the links to the object (as these
        cannot be easily validated against the serializer).

        Returns the final dataset to be used when saving to the database.
        """
        data = deepcopy(data)
        data.update(**links)
        return data

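    # Sketch of the pre_save merge (field names hypothetical): resolved links
    # are layered onto a copy of the validated data just before saving, leaving
    # the original dict untouched.
    #
    #   data  = {"sid": 123}
    #   links = {"link_to_model_a": <LinkToModelA instance>}
    #   pre_save(data, links)
    #   # -> {"sid": 123, "link_to_model_a": <LinkToModelA instance>}
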
    def save(self, data: dict):
        return self.serializer_class().create(data)

    def post_save(self, obj):
        """
        Post-processing after the object has been saved to the database.

        By default this caches any newly saved object.
        """
        self.nursery.cache_object(obj)

    def build(self) -> Set[str]:
        """
        Build up all the data for the object.

        This method co-ordinates the attempts to fetch the dependent data as
        well as the linked data. If at any point one of these steps fails, an
        empty set is returned (signifying failure). If all steps are deemed
        successful, the object is dispatched to the database automatically.

        On success, a set of the keys for any objects used which may be in the
        cache is returned.
        """
        if not self.dependency_keys and not self.links:
            self.dispatch()
            return {self.key}

        if not self.resolve_dependencies() or not self.resolve_links():
            return set()

        self.dispatch()
        self.dependency_keys.add(self.key)
        return self.dependency_keys

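    # Sketch of the two build outcomes (keys hypothetical):
    #
    #   handler.build() -> set()                    # a dependency or link is
    #                                               # still unresolved; the
    #                                               # handler should be cached
    #   handler.build() -> {"key_self", "key_dep"}  # dispatched; these keys
    #                                               # cover the cache entries
    #                                               # that were consumed
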
    def get_import_issues(self):
        """
        Iterates through the missing dependencies and returns them as a list of
        ImportIssueReportItem objects.

        This is later used for handling known issues with missing dependencies
        (GoodsNomenclatureDescriptionPeriod, for example).
        """
        if not self.resolve_dependencies():
            # Generic error for now - this can be made more specific later.
            for key in self._get_missing_dependencies():
                missing_dependency_data = self._get_dependency_key_data(key)
                dep_missing_details = (
                    f"dependency missing of type {missing_dependency_data.tag}"
                )

                dep_missing_details_dict = {}
                for field in missing_dependency_data.identifying_fields:
                    dep_missing_details_dict[field] = missing_dependency_data.data[
                        field
                    ]

                self.import_issues.append(
                    ImportIssueReportItem(
                        self.tag,
                        missing_dependency_data.tag,
                        dep_missing_details_dict,
                        key,
                        dep_missing_details,
                    ),
                )

        return self.import_issues

    def _get_dependency_key_data(self, key: str):
        """
        Returns the matching DependencyMappingData object for the provided key,
        or None if no match is found.

        params:
            key: str. The key used to identify / store the object in the cache.
        """
        for dep in self.dependency_key_mapping:
            if dep.key == key:
                return dep

        return None

    def dispatch(self) -> TrackedModel:
        """
        Save the data into the database.

        This method initially validates all collected data. If valid, it then
        runs some pre-processing before saving. Once saved, an opportunity is
        given for post-processing.
        """
        data = self.clean(self.data)
        data.update(transaction_id=self.transaction_id)
        logger.debug(f"Creating {self.model}: {data}")
        data = self.pre_save(data, self.resolved_links)
        obj = self.save(data)
        self.post_save(obj)
        return obj

    def serialize(self) -> DispatchedObjectType:
        """
        Provides a serializable dict of the object to be stored in the cache.

        Should hold all the necessary data required to rebuild the object.
        """
        return {
            "data": self.data,
            "tag": self.tag,
            "transaction_id": self.transaction_id,
        }

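    # Sketch of the cached form (values hypothetical): a plain dict matching
    # DispatchedObjectType, from which the nursery can rebuild the handler.
    #
    #   handler.serialize()
    #   # -> {"data": {"sid": "123"}, "tag": "footnote", "transaction_id": 42}
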
    @classmethod
    def register_dependant(cls, dependant: Type[BaseHandler]):
        """
        Allow a handler to retrospectively assign itself to another handler's
        list of dependencies.

        This solves the issue of forward-referencing - where a class cannot
        reference another class before that class has been defined.
        """
        if not cls.dependencies:
            cls.dependencies = [dependant]
        else:
            cls.dependencies.append(dependant)
        return dependant

    @classproperty
    def model(cls) -> Type[TrackedModel]:
        return cls.serializer_class.Meta.model