from __future__ import annotations
import logging
from copy import deepcopy
from dataclasses import dataclass
from typing import Iterable
from typing import List
from typing import Set
from typing import Type
from django.conf import settings
from django.utils.functional import classproperty
from rest_framework.serializers import ModelSerializer
from common.models import TrackedModel
from common.validators import UpdateType
from importer.nursery import TariffObjectNursery
from importer.utils import DispatchedObjectType
from importer.utils import LinksType
from importer.utils import generate_key
from taric_parsers.importer_issue import ImportIssueReportItem
logger = logging.getLogger(__name__)
[docs]@dataclass
class DependencyMappingData:
"""
Data class for temporarily storing dependency data while checking and
reporting issues. It is used to populate the issue report if issues are
detected.
params:
key: string, required.
The string used to identify the expected cached key for the dependency
tag: string, required.
the string of the tag that is used in TARIC3 to represent the object type of the dependency
identifying_fields: dict, required.
A dictionary of the identity fields and values defined for the current object
data: dict, required.
A dictionary if all associated data for the current object
"""
key: str
tag: str
identifying_fields: Iterable[str]
data: dict
[docs]class MismatchedSerializerError(Exception):
pass
[docs]class BaseHandler(metaclass=BaseHandlerMeta):
"""
The Base class for import handlers.
Handlers are designed to build objects which are then ready to be entered into the database.
This effectively takes place in 8 stages:
Init:
1. The handler is initialised with the initial data.
Build:
2. The handler checks for dependencies and links. If there are none it goes to step 5.
3. The handler searches for dependencies which may contain extra required data. If any can't be found it
asks to be cached and resolved later, the process stops. If they are found it unifies the data.
4. The handler searches for any links (foreign keys) that it needs. If any can't be found it asks to be
cached and resolved later, the process stops. If they are found it stores them.
Dispatch:
5. The handler validates the complete data against the serializer.
6. The handler runs any pre-save processing, including adding the foreign keys to the validated data.
7. The handler saves the object to the database.
8. The handler runs any post-save processing.
Many models are likely to have some specific requirements and so customisation is a focus within this system.
But many use cases should also be workable with just the base.
Most steps within this process can be customised and overridden - every model is likely to have some
specific custom demands.
A few examples of different scenarios follow below.
**Example 1**
Simple object, no dependencies or links.
For very simple objects there should be almost no work to do, assuming the data comes in clean without
any need for editing. In this case it should be enough to simply define a handler like so:
.. code:: python
class SimpleObjectHandler(BaseHandler):
serializer_class = serializers.SimpleObjectSerializer
tag = parsers.SimpleObjectParser.tag.name
Any object like this would be immediately processed when run through the nursery as, without any dependencies
or links, there should be nothing it needs to wait on.
**Example 2**
An object with dependencies:
.. code:: python
class DependentModelAHandler(BaseHandler):
serializer_class = serializers.DependentModelSerializer
tag = parser.DependentModelAParser.tag.name
@DependentModelAHandler.register_dependant
class DependentModelBHandler(BaseHandler):
dependencies = [DependentModelAHandler]
serializer_class = serializers.DependentModelSerializer
tag = parser.DependentModelBParser.tag.name
Dependencies in this case means two pre-existing models have been merged into one. Sadly the data import
doesn't account for this and therefore it is expected to receive the complete set of data for this object
over several records. A handler must therefore be created for each expected record type. All records will
then be stored in the nursery cache until they can be collected together for processing.
As dependencies are supposed to all relate to the same model, they must all share the same serializer.
Dependencies cascade. Consequentially if object A depends on object B, but object B depends on object C,
then object A will also depend on object C. However, given the nature of the dependencies, it is more
desirable to have all handlers be explicit about all dependencies they may rely on.
Dependencies are assigned at class level. Due to the way compilation works this creates issues with
forward-referencing. To handle this there are two mechanisms for registering dependencies. The first
is simply a class level dependency list with the dependent classes within them. The latter is with a
decorator which allows a class to decorate itself - therefore inserting itself into the aforementioned
dependency list of a pre-existing class.
With this defined the nursery will collect the data for each dependency. The dependencies can then query
the nursery to collect all the data. If all the data is found the object is dispatched to the database
and the nursery removes the relevant data from the cache.
**Example 3**
An object with Foreign Key links.
.. code:: python
class LinkedObjectHandler(BaseHandler):
links = (
{
"model": models.LinkToModelA,
"name": "link_to_model_a",
"optional": True,
},
{
"model": models.LinkToModelB,
"name": "link_to_model_b",
"optional": False,
"identifying_fields": ("some_field", "some_other_model_id")
},
)
def get_link_to_model_b_link(self, model, kwargs):
other_model = models.SomeOtherModel(field2=kwargs.get(some_other_model_id)
return model.objects.get_latest_version(other_model=other_model, **kargs)
Foreign key links are more flexible than dependencies. Whilst dependencies denote records which hold
part of the data for an object, links denote pre-existing objects which the current object needs to link
to. The difficulty here is there is no guarantee that the pre-existing object has actually been created
yet, nor that the object necessarily _needs_ to exist (the foreign key could be nullable).
Therefore Handlers have the option of adding a `links` attribute, which should be an iterable of `LinksType`
style dictionaries. This must define two keys:
1. model - which is expected to be a `TrackedModel` instance
2. name - a string, which is how the link data will be differentiated from the object data, as well as
how it will be named in the data when saved to the database. More specifically incoming data for the
linked field (specifically the models identifying fields) is expected to be prefixed with this name.
So the parser must define fields with this prefix.
Two other optional keys exist:
3. optional - defines whether a link is optional. If it is optional the object will be saved even if the
link can't be found. If it is not optional then the object will be cached until the link can be found.
4. identifying_fields - On occasion the identifying fields from model.identifying_fields may not be
appropriate, in this case they can be overridden here.
With just these the Handler will automatically try to fetch the linked model with the identifying fields of
the model (or those given in the link dictionary). Once fetched it will store the data with the given name.
Once ready to save the links will be added to the object data with the given name and the object will be saved.
In some cases this is not enough and further customisation is needed when fetching links. An example is a link
where one of the identifying fields is a foreign key on that linked field (i.e. the linked field itself has
another link). To allow for this a method can be added to fetch the link appropriately. This method must be named
`get_{link_name}_link` and accept the arguments `model` and `kwargs`. This must then return whatever object is
intended to go into the data with the same name as that given to the link.
All of the above examples can be used together, e.g. a handler can have both dependencies and links.
"""
dependencies: List[Type[BaseHandler]] = None
identifying_fields: Iterable[str] = None
links: Iterable[LinksType] = None
serializer_class: Type[ModelSerializer] = None
tag: str = None
dependency_key_mapping: List[DependencyMappingData] = list()
import_issues: List[ImportIssueReportItem] = list()
dependency_keys: List[str] = list()
def __init__(
self,
dispatched_object: DispatchedObjectType,
nursery: TariffObjectNursery,
):
self.nursery = nursery
self.data = dispatched_object["data"]
if not self.identifying_fields:
self.identifying_fields = self.model.identifying_fields
self.transaction_id = dispatched_object["transaction_id"]
self.key = generate_key(
tag=self.tag,
identifying_fields=self.identifying_fields,
data=self.data,
)
self.dependency_keys = self._generate_dependency_keys()
self.resolved_links = {}
def _generate_dependency_keys(self) -> Set[str]:
"""
Objects are stored in the cache using unique but identifiable IDs. Any
dependant object must be able to figure out all the keys for its
dependencies.
This method fetches all the dependencies, builds their keys and then
returns the keys as a set.
"""
depends_on = set()
if not self.dependencies:
return depends_on
for dependency in self.dependencies:
if dependency.serializer_class != self.serializer_class:
raise MismatchedSerializerError(
f"Dependent parsers must have the same serializer_class as their dependencies. "
f"Dependency {dependency.__name__} has "
f"serializer_class {dependency.serializer_class.__name__}. "
f"{self.__class__.__name__} has serializer_class {self.serializer_class.__name__}.",
)
key = generate_key(
tag=dependency.tag,
identifying_fields=self.identifying_fields,
data=self.data,
)
# using this to allow the correct mapping to be used later when reporting back to user
self.dependency_key_mapping.append(
DependencyMappingData(
key,
dependency.tag,
self.identifying_fields,
self.data,
),
)
depends_on.add(key)
return depends_on
[docs] def resolve_dependencies(self) -> bool:
"""
Search the cache for all object dependencies and attempt to resolve
them.
Previously found objects, which are dependent on the current object, should be
stored in the cache. This method loops over the current objects dependencies and
attempts to extract them from the cache. It then also searches for the dependencies
of the extracted objects.
All found dependencies are then merged into the current objects data. If at any point
a dependency is not found the method returns False - to signify the object cannot be resolved.
If all dependencies are found the method returns True.
"""
dependencies = self.dependency_keys.copy()
resolved_dependencies = {self.key}
while dependencies:
key = dependencies.pop()
dependency = self.nursery.get_handler_from_cache(key)
if not dependency:
return False
self.data.update(dependency.data)
resolved_dependencies.add(key)
dependencies.update(set(dependency.dependency_keys) - resolved_dependencies)
return True
def _get_missing_dependencies(self) -> list[str]:
"""
Returns a list of dependencies that are not in the cache.
returns:
list(str).
A list of dependency keys expected / required for the current object but are not in cache.
"""
dependencies = self.dependency_keys.copy()
missing_dependency_keys = []
while dependencies:
key = dependencies.pop()
dependency = self.nursery.get_handler_from_cache(key)
if not dependency:
missing_dependency_keys.append(key)
return missing_dependency_keys
[docs] def get_generic_link(self, model, kwargs):
"""
Fallback method if no specific method is found for fetching a link.
Raises DoesNotExist if no kwargs passed.
First attempts to retrieve the object PK from the cache (saves queries).
If this is not found a database query is made to find the object.
returns tuple(Object: model, bool: From Cache)
"""
if not kwargs.values():
raise model.DoesNotExist
if settings.USE_IMPORTER_CACHE:
cached_object = self.nursery.get_obj_from_cache(
model,
kwargs.keys(),
kwargs,
)
if cached_object and cached_object[1] == model.__name__:
return cached_object[0], True
try:
if self.data["update_type"] == UpdateType.DELETE:
return (
model.objects.get_versions(**kwargs).latest_deleted().get(),
False,
)
return model.objects.get_latest_version(**kwargs), False
except model.DoesNotExist as e:
if self.data["update_type"] == UpdateType.DELETE:
return model.objects.get_latest_version(**kwargs), False
raise e
[docs] def load_link(self, name, model, identifying_fields=None, optional=False):
"""
Load a given link for a handler.
This method first attempts to find any custom method existing on the handler
for finding the specific link. The custom method must be named:
get_{LINK_NAME}_link
If no custom method is found then :py:meth:`.BaseHandler.get_generic_link` is used.
If no object matching the given link is found and the link is non-optional then a
DoesNotExist error is raised.
"""
identifying_fields = identifying_fields or model.identifying_fields
try:
linked_object_identifiers = {
key: self.data.get(f"{name}__{key}") for key in identifying_fields
}
get_link_func = getattr(self, f"get_{name}_link", self.get_generic_link)
linked_object = get_link_func(model, linked_object_identifiers)
if isinstance(linked_object, tuple) and len(linked_object) > 1:
if linked_object[1]:
self.resolved_links[f"{name}_id"] = linked_object[0]
else:
self.resolved_links[name] = linked_object[0]
else:
self.resolved_links[name] = linked_object
except model.DoesNotExist:
if not optional:
return False
return True
[docs] def resolve_links(self) -> bool:
"""
Extract data specific to links and use this to search the database for
the relevant objects.
Once found attach the object to the `resolved_links` dictionary.
If a non-optional object can't be found return False. This signifies the links cannot be resolved.
If all non-optional objects are found then return True.
"""
if not self.links:
return True
for link in self.links:
if not self.load_link(**link):
return False
return True
[docs] def clean(self, data: dict) -> dict:
"""Validate the data against the serializer and return the validated
data."""
serializer = self.serializer_class(data=data)
serializer.is_valid(raise_exception=True)
return serializer.validated_data
[docs] def pre_save(self, data: dict, links: dict) -> dict:
"""
Pre-processing before the object is saved to the database.
Generally this is used for adding the links to the object (as these
cannot be easily validated against the serializer).
Return the final dataset to be used when saving to the database.
"""
data = deepcopy(data)
data.update(**links)
return data
def save(self, data: dict):
return self.serializer_class().create(data)
[docs] def post_save(self, obj):
"""
Post-processing after the object has been saved to the database.
By default this caches any new saved object.
"""
self.nursery.cache_object(obj)
[docs] def build(self) -> Set[str]:
"""
Build up all the data for the object.
This method co-ordinates the attempts to fetch the dependent data as
well as the linked data. If at any point one of these steps fails an
empty set returns (signifying failure).
if all steps are deemed successful the object is dispatched to the
database automatically. On success a set of all the keys for any objects
used which may be in the cache is returned.
"""
if not self.dependency_keys and not self.links:
self.dispatch()
return {self.key}
if not self.resolve_dependencies() or not self.resolve_links():
return set()
self.dispatch()
self.dependency_keys.add(self.key)
return self.dependency_keys
[docs] def get_import_issues(self):
"""
Iterates through missing dependencies and returns a list of missing
dependencies as a list of ImportIssueReportItems objects.
This is later used for handling known issues with missing dependencies
GoodsNomenclatureDescriptionOPeriod for example.
"""
if not self.resolve_dependencies():
# generic error - can do better to resolve later
dep_missing_details_dict = {}
for key in self._get_missing_dependencies():
missing_dependency_data = self._get_dependency_key_data(key)
dep_missing_details = (
f"dependency missing of type {missing_dependency_data.tag}, "
)
for index, field in enumerate(
missing_dependency_data.identifying_fields,
):
dep_missing_details_dict[field] = missing_dependency_data.data[
field
]
self.import_issues.append(
ImportIssueReportItem(
self.tag,
missing_dependency_data.tag,
dep_missing_details_dict,
key,
dep_missing_details,
),
)
return self.import_issues
def _get_dependency_key_data(self, key: str):
"""
Returns the matching DependencyMappingData object based on the provided
key.
params:
key, str.
key used to identify / store the object in cache
"""
for dep in self.dependency_key_mapping:
if dep.key == key:
return dep
return None
[docs] def dispatch(self) -> TrackedModel:
"""
Save the data into the database.
This method initially validates all collected data. If valid it then
runs some pre-processing before saving. Once saved an opportunity is
given for post-processing.
"""
data = self.clean(self.data)
data.update(transaction_id=self.transaction_id)
logger.debug(f"Creating {self.model}: {data}")
data = self.pre_save(data, self.resolved_links)
obj = self.save(data)
self.post_save(obj)
return obj
[docs] def serialize(self) -> DispatchedObjectType:
"""
Provides a serializable dict of the object to be stored in the cache.
Should hold all the necessary data required to rebuild the object
"""
return {
"data": self.data,
"tag": self.tag,
"transaction_id": self.transaction_id,
}
[docs] @classmethod
def register_dependant(cls, dependant: Type[BaseHandler]):
"""
Allow a handler to retrospectively assign itself to another handlers
list of dependencies.
This solves the issue of forward referencing - where a class cannot reference another class
before that class has been defined.
"""
if not cls.dependencies:
cls.dependencies = [dependant]
else:
cls.dependencies.append(dependant)
return dependant
@classproperty
def model(self) -> Type[TrackedModel]:
return self.serializer_class.Meta.model